diff --git a/ADA/SA/bert_analyzer.py b/ADA/SA/bert_analyzer.py index d472fcb39bc87fa626106ea9ca4298caac3e44e5..afd6b997bdb0351add4ef357562dab6c76332d8e 100644 --- a/ADA/SA/bert_analyzer.py +++ b/ADA/SA/bert_analyzer.py @@ -12,7 +12,7 @@ import shap semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml' semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml' -amazon_test_path = 'data/Amazon/amazon_camera_test.xml' +amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml' trained_model_path = 'semeval_2014_2.pt' BATCH_SIZE = 32 @@ -26,6 +26,12 @@ def loss(outputs, labels): class BertAnalyzer: + @staticmethod + def default(): + sa = BertAnalyzer() + sa.load_saved('semeval_2014.pt') + return sa + def load_saved(self, path): self.net = TDBertNet(len(polarity_indices)) self.net.load_state_dict(torch.load(path)) @@ -93,7 +99,7 @@ class BertAnalyzer: f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro') print('macro F1:', f1) - def analyze_sentence(self, text, char_from, char_to): + def get_sentiment_polarity(self, text, char_from, char_to): instance = Instance(text, char_from, char_to) tokens, tg_from, tg_to = instance.get() text, target_indices = instance.to_tensor() @@ -116,10 +122,18 @@ class BertAnalyzer: # ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right') # plt.show() - _, pred = torch.max(outputs.data, 1) - return pred - - -sentiment_analyzer = BertAnalyzer() -sentiment_analyzer.load_saved('semeval_2014.pt') -print(sentiment_analyzer.analyze_sentence("Well built laptop with win7.", 11, 17)) \ No newline at end of file + val, pred = torch.max(outputs.data, 1) + if pred == 0: + # positive + return val + elif pred == 1: + # negative + return -val + else: + # neutral or conflicted + return 0 + + +sentiment_analyzer = BertAnalyzer.default() +sentiment_analyzer.evaluate(semeval_2014_test_path) +sentiment_analyzer.evaluate(amazon_test_path) diff --git a/ADA/SA/sentiment_analyzer.py b/ADA/SA/sentiment_analyzer.py index 54ae37ceb3f2d7eec65fc23f2346e75f930dd3d4..e4311d33f3c4fca366772a2c84d1e9580d86705f 100644 --- a/ADA/SA/sentiment_analyzer.py +++ b/ADA/SA/sentiment_analyzer.py @@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer import os import math + def resample_data(instances, labels): label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels} max_n_instances = max([len(v) for v in label_instances.values()]) @@ -22,6 +23,7 @@ def resample_data(instances, labels): print(len(resampled_data)) return resampled_data + class SentimentAnalyzer: expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG @@ -39,10 +41,11 @@ class SentimentAnalyzer: def expresses_sentiment(self, instances): return self.expr_clf.predict([instance.vector for instance in instances]) + semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml' semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml' amazon_train_path = 'data/Amazon/amazon_camera_train.xml' -amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml' +amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml' semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml' semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' # tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml' @@ -56,7 +59,7 @@ sa = SentimentAnalyzer() train_tree = ET.parse(train_path) train_instances = [Instance(instance) for instance in train_tree.getroot()] -train_instances = resample_data(train_instances, labels) +# train_instances = resample_data(train_instances, labels) # create and train vectorizer model vec = Vectorizer(train_instances) diff --git a/ADA/SA/vectorizer.py b/ADA/SA/vectorizer.py index 7d24e18c0db57e12f5e93ee847ebdbe57f1f4ae5..45835316f60bc7ad45a6da19f64249c4643df340 100644 --- a/ADA/SA/vectorizer.py +++ b/ADA/SA/vectorizer.py @@ -17,7 +17,7 @@ class Vectorizer: self.transformer = TfidfTransformer() # indep features: - self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2)) + self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,5)) texts = [instance.text for instance in train_instances] train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray() train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances] diff --git a/ADA/agent.py b/ADA/agent.py index e908879510e50c82f0972d00897c179c72e91d33..10410f670b61a47f6792488f21ca832043cd21ea 100644 --- a/ADA/agent.py +++ b/ADA/agent.py @@ -5,11 +5,12 @@ from anytree import PostOrderIter import pickle from argument import * from functools import reduce - +from SA.bert_analyzer import BertAnalyzer class Agent: sentiment_threshold = 0.95 review_tokenizer = ReviewTokenizer() + bert_analyzer = BertAnalyzer.default() def __init__(self): # load classifier @@ -27,7 +28,7 @@ class Agent: return phrases # analyze sentiment - def get_sentiment(self, phrase): + def get_bayes_sentiment(self, phrase): # get classification tokens = self.review_tokenizer.tokenize_review(phrase) prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens)) @@ -35,6 +36,9 @@ class Agent: strength = (prob_classification.prob(classification) - 0.5) * 2 return strength if classification == '+' else -strength + def get_bert_sentiment(self, text, char_from, char_to): + return self.bert_analyzer.get_sentiment_polarity(text, char_from, char_to) + # remove all ancestors of node in list l def remove_ancestors(self, node, l): if node.parent != None: @@ -51,8 +55,9 @@ class Agent: while len(arguments) > 0: f = arguments.pop(0) for word in glossary[f]: - if word in phrase: - argument_matches.append(f) + matches = [(f, m.start(), m.end()) for m in re.finditer(word, phrase)] + if matches: + argument_matches += matches self.remove_ancestors(f, arguments) break return argument_matches @@ -61,17 +66,16 @@ class Agent: votes = {} vote_phrases = {} for phrase in phrases: - arguments = self.get_arguments(phrase) - sentiment = self.get_sentiment(phrase) - if abs(sentiment) > self.sentiment_threshold: - for argument in arguments: + for argument, start, end in self.get_arguments(phrase): + sentiment = self.get_bayes_sentiment(phrase) # self.get_bert_sentiment(phrase, start, end) + if abs(sentiment) > self.sentiment_threshold: if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)): votes[argument] = sentiment # what if there's two phrases with same argument? vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment} # normalize votes to 1 (+) or -1 (-) for argument in votes: votes[argument] = 1 if votes[argument] > 0 else -1 - return (votes, vote_phrases) + return votes, vote_phrases # augment votes (Definition 4.3) obtained for a single critic def augment_votes(self, votes): diff --git a/ADA/DataLoader.py b/ADA/dataloader.py similarity index 100% rename from ADA/DataLoader.py rename to ADA/dataloader.py index 1b0dc663fe8947ed2937dbff612cffd246a75280..33560bf95b0035541b24f1a8cc82611498785998 100644 --- a/ADA/DataLoader.py +++ b/ADA/dataloader.py @@ -1,7 +1,7 @@ import pandas as pd -class DataLoader: +class DataLoader: data_location = 'camera_prepared_data.tsv' reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) diff --git a/ADA/server/ios_server/views.py b/ADA/server/ios_server/views.py index b58d1508d26735dd3d9bc63e044c55420701cc43..691e6332e8b2dbcd026a1ccb9083b777a26ea4c5 100644 --- a/ADA/server/ios_server/views.py +++ b/ADA/server/ios_server/views.py @@ -4,7 +4,7 @@ import jsonpickle from django.views.decorators.csrf import csrf_exempt import sys -sys.path.append('/Users/joeloksanen/individual_project/ADA') +sys.path.append('/home/joel/individual_project/ADA') from dataloader import DataLoader from communicator import Communicator diff --git a/ADA/server/server/settings.py b/ADA/server/server/settings.py index a5931b845f9a8010f9e24472864cdc9db7699d16..f6446479f3aedf54b36354dc442c801d0a3cdf48 100644 --- a/ADA/server/server/settings.py +++ b/ADA/server/server/settings.py @@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True -ALLOWED_HOSTS = ['192.168.0.13', '146.169.222.109', '146.169.218.37'] +ALLOWED_HOSTS = ['192.168.1.104'] # Application definition