From af560174dfa9bbf68fd0e962de71f390a651b0c8 Mon Sep 17 00:00:00 2001
From: Joel Oksanen <jjo2317@ic.ac.uk>
Date: Sat, 25 Apr 2020 20:35:26 +0300
Subject: [PATCH] Server can now use BERT for SA; it seems to work better than
 Bayes SA

---
 ADA/server/agent/SA/bert_analyzer.py |  37 ++++++--
 ADA/server/agent/SA/bert_dataset.py  |  18 +++-
 ADA/server/agent/SA/tdbertnet.py     |   4 +-
 ADA/server/agent/agent.py            | 136 ++++++++++-----------------
 ADA/server/agent/communicator.py     |   6 +-
 ADA/server/agent/review.py           | 109 +++++++++++++++++++++
 ADA/server/ios_server/views.py       |   5 +-
 7 files changed, 209 insertions(+), 106 deletions(-)
 create mode 100644 ADA/server/agent/review.py

diff --git a/ADA/server/agent/SA/bert_analyzer.py b/ADA/server/agent/SA/bert_analyzer.py
index fbb9c1b..e35e779 100644
--- a/ADA/server/agent/SA/bert_analyzer.py
+++ b/ADA/server/agent/SA/bert_analyzer.py
@@ -8,10 +8,10 @@ import time
 import numpy as np
 from sklearn import metrics
 
-semeval_2014_train_path = 'agent/SA/data/SemEval-2014/Laptop_Train_v2.xml'
-semeval_2014_test_path = 'agent/SA/data/SemEval-2014/Laptops_Test_Gold.xml'
+semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
+semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'agent/SA/data/Amazon/annotated_amazon_laptop_reviews.xml'
-trained_model_path = 'agent/SA/semeval_2014_2.pt'
+trained_model_path = 'semeval_2014_2.pt'
 
 BATCH_SIZE = 32
 MAX_EPOCHS = 6
@@ -22,12 +22,13 @@ loss_criterion = nn.CrossEntropyLoss()
 def loss(outputs, labels):
     return loss_criterion(outputs, labels)
 
+
 class BertAnalyzer:
 
     @staticmethod
     def default():
         sa = BertAnalyzer()
-        sa.load_saved('agent/SA/semeval_2014.pt')
+        sa.load_saved(trained_model_path)
         return sa
 
     def load_saved(self, path):
@@ -35,8 +36,8 @@ class BertAnalyzer:
         self.net.load_state_dict(torch.load(path))
         self.net.eval()
 
-    def train(self, dataset):
-        train_data = BertDataset(dataset)
+    def train(self, data_file):
+        train_data = BertDataset.from_file(data_file)
         train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                   collate_fn=generate_batch)
 
@@ -72,8 +73,8 @@ class BertAnalyzer:
 
         torch.save(self.net.state_dict(), trained_model_path)
 
-    def evaluate(self, dataset):
-        test_data = BertDataset(dataset)
+    def evaluate(self, data_file):
+        test_data = BertDataset.from_file(data_file)
         test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                                  collate_fn=generate_batch)
 
@@ -97,6 +98,19 @@ class BertAnalyzer:
         f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
         print('macro F1:', f1)
 
+    def get_batch_sentiment_polarity(self, data):
+        dataset = BertDataset.from_data(data)
+        loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=generate_batch)
+
+        predicted = []
+        with torch.no_grad():
+            for texts, target_indices, _ in loader:
+                outputs, attentions = self.net(texts, target_indices)
+                batch_val, batch_pred = torch.max(outputs.data, 1)
+                predicted += [BertAnalyzer.get_polarity(val, pred) for val, pred in zip(batch_val, batch_pred)]
+
+        return predicted
+
     def get_sentiment_polarity(self, text, char_from, char_to):
         instance = Instance(text, char_from, char_to)
         tokens, tg_from, tg_to = instance.get()
@@ -121,6 +135,10 @@ class BertAnalyzer:
         # plt.show()
 
         val, pred = torch.max(outputs.data, 1)
+        return BertAnalyzer.get_polarity(val, pred)
+
+    @staticmethod
+    def get_polarity(val, pred):
         if pred == 0:
             # positive
             return val
@@ -129,5 +147,4 @@ class BertAnalyzer:
             return -val
         else:
             # neutral or conflicted
-            return 0
-
+            return 0
\ No newline at end of file
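The new get_batch_sentiment_polarity entry point scores many (text, char_from, char_to) aspect spans in one forward pass instead of one call per phrase, which is what makes switching the server from Bayes to BERT SA affordable. A minimal usage sketch of that API (the example texts and spans are made up; it assumes the weights at trained_model_path are present, and note that the paths above are now relative to agent/SA/):

    from agent.SA.bert_analyzer import BertAnalyzer

    # (text, char_from, char_to) triples; the character span marks the aspect target
    data = [
        ('The battery life is amazing', 4, 16),  # target: 'battery life'
        ('The screen is too dim', 4, 10),        # target: 'screen'
    ]

    analyzer = BertAnalyzer.default()  # loads trained_model_path
    polarities = analyzer.get_batch_sentiment_polarity(data)
    # get_polarity maps each prediction to a signed confidence:
    # > 0 positive, < 0 negative, 0 neutral or conflicted
    for (text, _, _), polarity in zip(data, polarities):
        print(f'{float(polarity):+.3f} {text}')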
diff --git a/ADA/server/agent/SA/bert_dataset.py b/ADA/server/agent/SA/bert_dataset.py
index 9a754d3..c507c57 100644
--- a/ADA/server/agent/SA/bert_dataset.py
+++ b/ADA/server/agent/SA/bert_dataset.py
@@ -40,11 +40,14 @@ def polarity_index(polarity):
 
 class BertDataset(Dataset):
 
-    def __init__(self, xml_file):
-        tree = ET.parse(xml_file)
-
+    def __init__(self):
         self.data = []
 
+    @staticmethod
+    def from_file(file):
+        dataset = BertDataset()
+        tree = ET.parse(file)
+        dataset.data = []
         for sentence in tree.getroot():
             text = sentence.find('text').text
             aspect_terms = sentence.find('aspectTerms')
@@ -53,7 +56,14 @@
                 char_from = int(term.attrib['from'])
                 char_to = int(term.attrib['to'])
                 polarity = term.attrib['polarity']
-                self.data.append((Instance(text, char_from, char_to), polarity))
+                dataset.data.append((Instance(text, char_from, char_to), polarity))
+        return dataset
+
+    @staticmethod
+    def from_data(data):
+        dataset = BertDataset()
+        dataset.data = [(Instance(text, char_from, char_to), 'neutral') for text, char_from, char_to in data]
+        return dataset
 
     def __len__(self):
         return len(self.data)
diff --git a/ADA/server/agent/SA/tdbertnet.py b/ADA/server/agent/SA/tdbertnet.py
index d3db489..6a84b9b 100644
--- a/ADA/server/agent/SA/tdbertnet.py
+++ b/ADA/server/agent/SA/tdbertnet.py
@@ -22,7 +22,7 @@ class TDBertNet(nn.Module):
         # max pooling at target locations
         target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
         pooled_output = torch.max(target_outputs, dim=1)[0]
-        # fc layer
-        x = self.fc(pooled_output)
+        # fc layer with softmax activation
+        x = F.softmax(self.fc(pooled_output), 1)
         return x, attentions[-1]
 
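BertDataset now has two construction paths: from_file parses a SemEval-style XML file with gold polarity labels for training and evaluation, while from_data wraps raw spans for inference, attaching 'neutral' purely as a placeholder label that get_batch_sentiment_polarity never reads. A short sketch of both paths (the XML path is the training path defined in bert_analyzer.py):

    from agent.SA.bert_dataset import BertDataset

    # training/evaluation: instances with gold polarity labels parsed from XML
    train_data = BertDataset.from_file('data/SemEval-2014/Laptop_Train_v2.xml')

    # inference: raw (text, char_from, char_to) spans with a dummy label
    infer_data = BertDataset.from_data([('Great keyboard, noisy fan', 6, 14)])

One consequence of the tdbertnet.py change worth noting: the softmax makes the value returned by get_polarity a probability in [0, 1], which is presumably what the 0.95 SENTIMENT_THRESHOLD in review.py below assumes. However, loss still wraps nn.CrossEntropyLoss, which expects logits and applies log-softmax internally, so any further training through this head would effectively apply softmax twice.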
diff --git a/ADA/server/agent/agent.py b/ADA/server/agent/agent.py
index 294a773..1683729 100644
--- a/ADA/server/agent/agent.py
+++ b/ADA/server/agent/agent.py
@@ -1,4 +1,3 @@
-from nltk.tokenize import sent_tokenize
 import re
 from agent.review_tokenizer import ReviewTokenizer
 from anytree import PostOrderIter
@@ -6,9 +5,11 @@ import pickle
 from agent.argument import *
 from functools import reduce
 from agent.SA.bert_analyzer import BertAnalyzer
+from agent.review import Review
+
 
 class Agent:
-    sentiment_threshold = 0.95
+
     review_tokenizer = ReviewTokenizer()
     bert_analyzer = BertAnalyzer.default()
 
@@ -18,15 +19,6 @@ class Agent:
         self.classifier = pickle.load(f)
         f.close()
 
-    # extract phrases
-    def extract_phrases(self, review_body):
-        sentences = sent_tokenize(review_body)
-        phrases = []
-        for sentence in sentences:
-            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
-                                sentence)
-        return phrases
-
     # analyze sentiment
     def get_bayes_sentiment(self, phrase):
         # get classification
@@ -36,60 +28,42 @@ class Agent:
         strength = (prob_classification.prob(classification) - 0.5) * 2
         return strength if classification == '+' else -strength
 
-    def get_bert_sentiment(self, text, char_from, char_to):
-        return self.bert_analyzer.get_sentiment_polarity(text, char_from, char_to)
-
-    # remove all ancestors of node in list l
-    def remove_ancestors(self, node, l):
-        if node.parent != None:
-            try:
-                l.remove(node.parent)
-            except ValueError:
-                pass
-            self.remove_ancestors(node.parent, l)
-
-    # get argument(s) that match phrase
-    def get_arguments(self, phrase):
-        argument_matches = []
-        arguments = [node for node in PostOrderIter(camera)]
-        while len(arguments) > 0:
-            f = arguments.pop(0)
-            for word in glossary[f]:
-                matches = [(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
-                if matches:
-                    argument_matches += matches
-                    self.remove_ancestors(f, arguments)
-                    break
-        return argument_matches
-
-    def extract_votes(self, phrases):
-        votes = {}
-        vote_phrases = {}
-        for phrase in phrases:
-            for argument, start, end in self.get_arguments(phrase):
-                sentiment = self.get_bayes_sentiment(phrase)  # self.get_bert_sentiment(phrase, start, end)
-                if abs(sentiment) > self.sentiment_threshold:
-                    if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
-                        votes[argument] = sentiment  # what if there's two phrases with same argument?
-                        vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
-        # normalize votes to 1 (+) or -1 (-)
-        for argument in votes:
-            votes[argument] = 1 if votes[argument] > 0 else -1
-        return votes, vote_phrases
-
-    # augment votes (Definition 4.3) obtained for a single critic
-    def augment_votes(self, votes):
-        arguments = [node for node in PostOrderIter(camera)]
-        for argument in arguments:
-            if argument not in votes:
-                polar_sum = 0
-                for subfeat in argument.children:
-                    if subfeat in votes:
-                        polar_sum += votes[subfeat]
-                if polar_sum != 0:
-                    votes[argument] = 1 if polar_sum > 0 else -1
-
-    def get_qbaf(self, ra, review_count):
+    def get_bert_sentiments(self, data):
+        return list(self.bert_analyzer.get_batch_sentiment_polarity(data))
+
+    def extract_votes(self, reviews):
+        labelled_phrases = [(phrase.text, arg.start, arg.end) for review in reviews for phrase in review.phrases for arg
+                            in phrase.args]
+
+        sentiments = self.get_bert_sentiments(labelled_phrases)
+
+        for review in reviews:
+            for phrase in review.phrases:
+                bayes_sentiment = self.get_bayes_sentiment(phrase.text)
+                for arg in phrase.args:
+                    sentiment = sentiments.pop(0)
+                    print(phrase.text)
+                    print('arg:', arg.start, '-', arg.end)
+                    print('bert:', sentiment)
+                    print('bayes:', bayes_sentiment)
+                    arg.set_sentiment(sentiment)
+
+    @staticmethod
+    def get_aggregates(reviews):
+        ra = []
+        vote_sum = {arg: 0 for arg in arguments}
+        vote_phrases = {arg: [] for arg in arguments}
+        for review in reviews:
+            for phrase in review.phrases:
+                for arg, sentiment in phrase.get_votes().items():
+                    vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment})
+            for arg, sentiment in review.get_votes().items():
+                ra.append({'review_id': review.id, 'argument': arg, 'vote': sentiment})
+                vote_sum[arg] += sentiment
+        return ra, vote_sum, vote_phrases
+
+    @staticmethod
+    def get_qbaf(ra, review_count):
         # sums of all positive and negative votes for arguments
         argument_sums = {}
         for argument in arguments:
@@ -147,31 +121,17 @@ class Agent:
                                           supporter_strengths)
         return strengths
 
-    def analyze_reviews(self, reviews):
-        # get ra
-        self.ra = []
-        self.vote_sum = {argument: 0 for argument in arguments}
-        self.vote_phrases = {argument: [] for argument in arguments}
-        voting_reviews = 0
-        review_count = 0
-        for _, review in reviews.iterrows():
-            review_id = review['review_id']
-            review_count += 1
-            phrases = self.extract_phrases(review['review_body'])
-            votes, vote_phrases = self.extract_votes(phrases)
-            self.augment_votes(votes)
-            voting_reviews += 1 if len(votes) > 0 else 0
-            # add final vote tuples to ra with simplified polarity in {+ (true), - (false)}
-            for argument in votes:
-                self.ra.append({'review_id': review_id, 'argument': argument, 'vote': votes[argument]})
-                self.vote_sum[argument] += votes[argument]
-            for argument in vote_phrases:
-                self.vote_phrases[argument].append(vote_phrases[argument])
-        # only consider items that obtained votes from at least 33% of reviewers
-        if voting_reviews / review_count < 0.33:
+    def analyze_reviews(self, csv):
+        reviews = [Review(row) for _, row in csv.iterrows()]
+        # extract augmented votes
+        self.extract_votes(reviews)
+        voting_reviews = list(filter(lambda r: r.is_voting(), reviews))
+        if len(voting_reviews) / len(reviews) < 0.33:
             print('warning: only a small fraction of reviews generated votes')
+        # get aggregates
+        ra, self.vote_sum, self.vote_phrases = Agent.get_aggregates(reviews)
         # get qbaf from ra
-        self.qbaf = self.get_qbaf(self.ra, review_count)
+        self.qbaf = self.get_qbaf(ra, len(reviews))
         # apply gradual semantics
         self.strengths = self.get_strengths(self.qbaf)
         # print results
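extract_votes leans on an ordering invariant: labelled_phrases is built by iterating reviews, then phrases, then args, and the DataLoader in get_batch_sentiment_polarity keeps shuffle=False, so the i-th returned sentiment belongs to the i-th (phrase, arg) pair. Since sentiments.pop(0) is O(n) per call, an iterator gives the same order-preserving reassignment more cheaply; a sketch of that variant (a hypothetical helper, not part of this patch):

    # Equivalent to the pop(0) loop in extract_votes, assuming the same
    # review -> phrase -> arg nesting order used to build labelled_phrases.
    def assign_sentiments(reviews, sentiments):
        it = iter(sentiments)
        for review in reviews:
            for phrase in review.phrases:
                for arg in phrase.args:
                    # next(it) yields the score for exactly this (phrase, arg)
                    arg.set_sentiment(next(it))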
diff --git a/ADA/server/agent/communicator.py b/ADA/server/agent/communicator.py
index e09bc1c..7350eca 100644
--- a/ADA/server/agent/communicator.py
+++ b/ADA/server/agent/communicator.py
@@ -27,8 +27,12 @@ class Communicator:
 
     def __init__(self, dl):
         self.dl = dl
+        self.product_id = None
 
-    def set_product(self, product_id):
+    def has_loaded_product(self, product_id):
+        return self.product_id == product_id
+
+    def load_product(self, product_id):
         self.product_id = product_id
         self.arguments = {arguments[i] : Argument(i, arguments[i].name) for i in range(len(arguments))}
         self.argument_nodes = arguments
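Together with the views.py change at the bottom of the patch, the has_loaded_product/load_product split turns product loading into a two-step protocol: the first request for a product runs the (slow) review analysis and answers with a bare "OK", and a later request finds the product cached and gets the real init message from get_init_message. A hypothetical client-side polling loop against that protocol (URL and parameter names are illustrative, not taken from the codebase):

    import time
    import requests

    def fetch_product(base_url, product_id):
        # retries until the server has finished analyzing the product's reviews
        while True:
            resp = requests.get(base_url + '/product', params={'id': product_id})
            if resp.text != 'OK':
                return resp  # product loaded; response carries the init message
            time.sleep(1.0)  # server is still analyzing; try again shortly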
diff --git a/ADA/server/agent/review.py b/ADA/server/agent/review.py
new file mode 100644
index 0000000..2df00c8
--- /dev/null
+++ b/ADA/server/agent/review.py
@@ -0,0 +1,109 @@
+import re
+from nltk.tokenize import sent_tokenize
+from agent.SA.bert_dataset import MAX_SEQ_LEN
+from anytree import PostOrderIter
+from agent.argument import *
+
+
+class Review:
+    SENTIMENT_THRESHOLD = 0.95
+    PHRASE_MAX_WORDS = MAX_SEQ_LEN * 0.3
+
+    def __init__(self, data):
+        self.id = data['review_id']
+        self.body = data['review_body']
+        self.phrases = Review.extract_phrases(self.body)
+        self.votes = {}
+
+    # extract phrases
+    @staticmethod
+    def extract_phrases(review_body):
+        sentences = sent_tokenize(review_body)
+        texts = []
+        for sentence in sentences:
+            texts += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
+                              sentence)
+        texts = filter(lambda t: len(t.split()) < Review.PHRASE_MAX_WORDS, texts)
+        phrases = [Phrase(text) for text in texts]
+        return phrases
+
+    def get_votes(self):
+        for arg, sentiment in [(arg, sentiment) for phrase in self.phrases for arg, sentiment in phrase.votes.items()]:
+            if arg not in self.votes or abs(sentiment) > abs(self.votes[arg]):
+                self.votes[arg] = sentiment
+        # normalize
+        for arg in self.votes:
+            self.votes[arg] = 1 if self.votes[arg] > 0 else -1
+        self.augment_votes()
+        return self.votes
+
+    # augment votes (Definition 4.3) obtained for a single critic
+    def augment_votes(self):
+        arguments = [node for node in PostOrderIter(camera)]
+        for argument in arguments:
+            if argument not in self.votes:
+                polar_sum = 0
+                for subfeat in argument.children:
+                    if subfeat in self.votes:
+                        polar_sum += self.votes[subfeat]
+                if polar_sum != 0:
+                    self.votes[argument] = 1 if polar_sum > 0 else -1
+
+    def is_voting(self):
+        return len(self.votes) > 0
+
+
+class Phrase:
+
+    def __init__(self, text):
+        self.text = text
+        self.args = self.get_args(text)
+        self.votes = {}
+
+    # get argument(s) that match phrase
+    def get_args(self, phrase):
+        argument_matches = []
+        arguments = [node for node in PostOrderIter(camera)]
+        while len(arguments) > 0:
+            f = arguments.pop(0)
+            for word in glossary[f]:
+                matches = [Arg(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
+                if matches:
+                    argument_matches += matches
+                    self.remove_ancestors(f, arguments)
+                    break
+        return argument_matches
+
+    # remove all ancestors of node in list l
+    def remove_ancestors(self, node, l):
+        if node.parent != None:
+            try:
+                l.remove(node.parent)
+            except ValueError:
+                pass
+            self.remove_ancestors(node.parent, l)
+
+    def add_arg(self, arg):
+        self.args.append(arg)
+
+    def num_args(self):
+        return len(self.args)
+
+    def get_votes(self):
+        for arg in self.args:
+            if (abs(arg.sentiment) > Review.SENTIMENT_THRESHOLD and
+                    (arg.node not in self.votes or abs(arg.sentiment) > abs(self.votes[arg.node]))):
+                self.votes[arg.node] = arg.sentiment
+        return self.votes
+
+
+class Arg:
+
+    def __init__(self, node, start, end):
+        self.node = node
+        self.start = start
+        self.end = end
+        self.sentiment = None
+
+    def set_sentiment(self, sentiment):
+        self.sentiment = sentiment
diff --git a/ADA/server/ios_server/views.py b/ADA/server/ios_server/views.py
index 1956ff6..fc17aed 100644
--- a/ADA/server/ios_server/views.py
+++ b/ADA/server/ios_server/views.py
@@ -20,7 +20,10 @@ def product(request):
     star_rating = dl.get_avg_star_rating(id)
     image_url = 'https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN=' + id + '&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SL250'
 
-    communicator.set_product(id)
+    if not communicator.has_loaded_product(id):
+        communicator.load_product(id)
+        return HttpResponse("OK")
+
     init_message = communicator.get_init_message()
 
     class Empty:
-- 
GitLab
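With Review, Phrase and Arg carrying the per-review state, Agent.analyze_reviews reduces to orchestration over a pandas frame: it iterates rows with iterrows and reads the 'review_id' and 'review_body' columns. An end-to-end sketch of the new flow; the CSV name is made up, and Agent's constructor arguments are not visible in this patch, so the bare Agent() call is an assumption:

    import pandas as pd
    from agent.agent import Agent

    # hypothetical input; rows need review_id and review_body columns
    reviews = pd.read_csv('reviews.csv')

    agent = Agent()  # assumed no-arg constructor; signature not shown in this patch
    agent.analyze_reviews(reviews)
    print(agent.strengths)  # argument strengths from the gradual semantics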