From ce3937014fef529af7ef9bf0d383abcf5717987e Mon Sep 17 00:00:00 2001
From: Joel Oksanen <jjo2317@ic.ac.uk>
Date: Sat, 9 May 2020 16:51:47 +0300
Subject: [PATCH] Improved integration between target_extractor and server

---
 ADA/server/agent/agent.py                     | 13 ++--
 ADA/server/agent/communicator.py              | 14 ++++
 ADA/server/agent/dataloader.py                |  3 +-
 ADA/server/agent/prep_data.py                 |  2 +-
 ADA/server/agent/prep_metadata.py             |  7 +-
 ADA/server/agent/product_finder.py            |  2 +-
 ADA/server/agent/review.py                    | 29 +++++--
 ADA/server/agent/review_annotation.py         |  2 +-
 ADA/server/agent/target_extraction/product.py |  7 +-
 .../target_extraction/target_extractor.py     | 77 ++++++-------------
 ADA/server/agent/train_classifier.py          |  2 +-
 ADA/server/ios_server/views.py                |  1 -
 12 files changed, 79 insertions(+), 80 deletions(-)

diff --git a/ADA/server/agent/agent.py b/ADA/server/agent/agent.py
index 3be5567..f237644 100644
--- a/ADA/server/agent/agent.py
+++ b/ADA/server/agent/agent.py
@@ -140,16 +140,17 @@ class Agent:
         return max(attacker_strengths, key=attacker_strengths.get)
 
     def liked_argument(self, argument):
-        return self.vote_sum[
-            argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+        return self.vote_sum[argument] >= 0
+        # self.strengths[argument] > 0.5
+        # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
 
     def supported_argument(self, argument):
-        return (self.get_strongest_supporting_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_supporting_subfeature(argument)] > 0)
+        supp = self.get_strongest_supporting_subfeature(argument)
+        return supp is not None and self.strengths[supp] > 0
 
     def attacked_argument(self, argument):
-        return (self.get_strongest_attacking_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
+        att = self.get_strongest_attacking_subfeature(argument)
+        return att is not None and self.strengths[att] > 0
 
     def best_supporting_phrase(self, argument):
         phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
diff --git a/ADA/server/agent/communicator.py b/ADA/server/agent/communicator.py
index fb86eb3..b54939c 100644
--- a/ADA/server/agent/communicator.py
+++ b/ADA/server/agent/communicator.py
@@ -88,6 +88,20 @@ class Communicator:
                 text += '.'
             args = [q_arg_node, supp_node]
 
+        if query_id == 3:
+            supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
+            att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
+            att_name = self.product.argument_for_node(att_node).name
+            text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
+                                                                                    self.was_were(att_name))
+            if supp_node:
+                supp_name = self.product.argument_for_node(supp_node).name
+                text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_name))
+                args = [q_arg_node, att_node, supp_node]
+            else:
+                text += '.'
+                args = [q_arg_node, att_node]
+
         if query_id == 4 or query_id == 5:
             phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
                       else self.agent.best_attacking_phrase(q_arg_node))
diff --git a/ADA/server/agent/dataloader.py b/ADA/server/agent/dataloader.py
index 0a7cb3a..3908603 100644
--- a/ADA/server/agent/dataloader.py
+++ b/ADA/server/agent/dataloader.py
@@ -1,7 +1,8 @@
 import pandas as pd
 
+
 class DataLoader:
-    data_location = 'agent/amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+    data_location = 'agent/amazon_data/amazon_reviews_us_pc.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 
     def get_reviews(self, product_id):
diff --git a/ADA/server/agent/prep_data.py b/ADA/server/agent/prep_data.py
index f0c7ba8..db78a30 100644
--- a/ADA/server/agent/prep_data.py
+++ b/ADA/server/agent/prep_data.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_camera.tsv'
 training_data_location = 'amazon_data/reviews_trained_on.tsv'
 output_location = 'amazon_data/camera_prepared_data.tsv'
 min_reviews = 50
diff --git a/ADA/server/agent/prep_metadata.py b/ADA/server/agent/prep_metadata.py
index 4dd7a48..58ceaba 100644
--- a/ADA/server/agent/prep_metadata.py
+++ b/ADA/server/agent/prep_metadata.py
@@ -25,13 +25,10 @@ def get_df(path):
 pd.set_option('display.max_colwidth', None)
 
 category = 'Backpacks'
-metadata = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True)
-for col in metadata.columns:
-    print(col)
-metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
+metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
+metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: category in cl)] for metadata in metadata_iter])
 
-print(metadata['category'][:5])
 print(len(metadata.index))
 
 review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
 
diff --git a/ADA/server/agent/product_finder.py b/ADA/server/agent/product_finder.py
index 447e895..fb5a0e6 100644
--- a/ADA/server/agent/product_finder.py
+++ b/ADA/server/agent/product_finder.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 training_data_location = 'reviews_trained_on.tsv'
 min_characters = 50
 reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
diff --git a/ADA/server/agent/review.py b/ADA/server/agent/review.py
index 3875165..a8158bd 100644
--- a/ADA/server/agent/review.py
+++ b/ADA/server/agent/review.py
@@ -1,7 +1,10 @@
 import re
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
 from agent.SA.bert_dataset import MAX_SEQ_LEN
 from anytree import PostOrderIter
+from nltk.stem import WordNetLemmatizer
+
+wnl = WordNetLemmatizer()
 
 
 class Review:
@@ -34,7 +37,7 @@ class Review:
         # normalize
         for arg in self.votes:
             self.votes[arg] = 1 if self.votes[arg] > 0 else -1
-        self.augment_votes()
+        # self.augment_votes()
         return self.votes
 
     # augment votes (Definition 4.3) obtained for a single critic
@@ -58,20 +61,21 @@ class Phrase:
     def __init__(self, text, product):
         self.product = product
         self.text = text
-        self.args = self.get_args(text)
+        self.tokens = [wnl.lemmatize(word.lower()) for word in word_tokenize(text)]
+        self.args = self.get_args()
         self.votes = {}
 
     # get argument(s) that match phrase
-    def get_args(self, phrase):
+    def get_args(self):
         argument_matches = []
         arguments = [node for node in PostOrderIter(self.product.root)]
         while len(arguments) > 0:
-            f = arguments.pop(0)
-            for word in self.product.glossary[f]:
-                matches = [Arg(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
+            arg = arguments.pop(0)
+            for term in self.product.glossary[arg]:
+                matches = [Arg(arg, start, end) for start, end in Phrase.matching_subsequences(term, self.tokens)]
                 if matches:
                     argument_matches += matches
-                    self.remove_ancestors(f, arguments)
+                    self.remove_ancestors(arg, arguments)
                     break
         return argument_matches
 
@@ -97,6 +101,15 @@ class Phrase:
 
                 self.votes[arg.node] = arg.sentiment
         return self.votes
 
+    @staticmethod
+    def matching_subsequences(l_sub, l):
+        sub_idxs = []
+        len_sub = len(l_sub)
+        for i in range(len(l)):
+            if l[i:i+len_sub] == l_sub:
+                sub_idxs.append((i, i+len_sub))
+        return sub_idxs
+
 
 class Arg:
diff --git a/ADA/server/agent/review_annotation.py b/ADA/server/agent/review_annotation.py
index 8cd35be..06e424e 100644
--- a/ADA/server/agent/review_annotation.py
+++ b/ADA/server/agent/review_annotation.py
@@ -12,7 +12,7 @@
 import readchar
 from sty import fg, bg, ef, rs
 from wcwidth import wcswidth
-data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_pc.tsv'
 selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
 min_characters = 0
 max_characters = 200
diff --git a/ADA/server/agent/target_extraction/product.py b/ADA/server/agent/target_extraction/product.py
index a73f2ca..0686f6c 100644
--- a/ADA/server/agent/target_extraction/product.py
+++ b/ADA/server/agent/target_extraction/product.py
@@ -12,9 +12,10 @@ class Product:
         self.root = root
         self.feature_nodes = [n for n in root.descendants]
         self.argument_nodes = [root] + self.feature_nodes
-        self.glossary = {a_node: syns for a, syns in syn_dict.items() for a_node in self.argument_nodes
-                         if a_node.name == a}
-        self.arguments = {a_node: Argument(a_idx, a_node.name) for a_idx, a_node in enumerate(self.argument_nodes)}
+        self.glossary = {a_node: [syn.split('_') for syn in syns]
+                         for a, syns in syn_dict.items() for a_node in self.argument_nodes if a_node.name == a}
+        self.arguments = {a_node: Argument(a_idx, a_node.name.replace('_', ' '))
+                          for a_idx, a_node in enumerate(self.argument_nodes)}
 
     def argument_node_for_id(self, id):
         return self.argument_nodes[id]
diff --git a/ADA/server/agent/target_extraction/target_extractor.py b/ADA/server/agent/target_extraction/target_extractor.py
index 0490e26..9cebd1e 100644
--- a/ADA/server/agent/target_extraction/target_extractor.py
+++ b/ADA/server/agent/target_extraction/target_extractor.py
@@ -50,20 +50,26 @@ class TargetExtractor:
     SYNONYM_SIMILARITY = 0.1
 
     # parent is a TargetExtrator of a parent category, eg. > electronics > camera
-    def __init__(self, product, texts, parent=None):
+    def __init__(self, product, texts, parent_texts=None, phraser=None):
         self.product = product
-        self.parent = parent
 
         print('tokenizing phrases...')
         # tokenize and normalize phrases
         self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
                         for text in texts for phrase in sent_tokenize(text)]
 
-        print('obtaining bigrams...')
-        # train bigram map
-        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
-        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
-        self.phraser = Phraser(trigram)
+        if phraser is None:
+            print('obtaining n-grams...')
+            # train bigram map
+            bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
+            trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
+            self.phraser = Phraser(trigram)
+        else:
+            self.phraser = phraser
+
+        if parent_texts is not None:
+            print('analysing parent reviews...')
+            self.parent = TargetExtractor(product=None, texts=parent_texts, phraser=self.phraser)
 
         print('counting terms...')
         # count terms
@@ -71,7 +77,7 @@ class TargetExtractor:
         self.total_count = sum(self.counter.values())
 
     def save_product_representation(self):
-        f = open(Product.FILE_DIR + self.product + Product.FILE_EXTENSION, 'wb')
+        f = open('extracted_products/' + self.product + Product.FILE_EXTENSION, 'wb')
         p = Product(self.tree, self.syn_dict)
         pickle.dump(p, f)
         f.close()
@@ -105,7 +111,7 @@ class TargetExtractor:
 
         print('extracting aspect tree...')
         # extract aspect tree
-        self.tree = self.get_product_tree2()
+        self.tree = self.get_product_tree()
 
         return self.tree, self.syn_dict
 
@@ -179,7 +185,9 @@ class TargetExtractor:
         nouns = []
         for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram)
+                       if self.product and bigram != self.product
+                       else bigram
                        for bigram in self.phraser[phrase]]
             word_idx = 0
             for token in bigrams:
@@ -189,13 +197,13 @@ class TargetExtractor:
                 has_noun = any(TargetExtractor.is_noun(pos_tags[i]) for i in word_range)
                 all_terms_valid = all(TargetExtractor.is_valid_term(pos_tags[i]) for i in word_range)
                 if has_noun and all_terms_valid:
-                    nouns.append(TargetExtractor.singular(token))
+                    nouns.append(token)
                 word_idx += len(words)
             else:
                 is_noun = TargetExtractor.is_noun(pos_tags[word_idx])
                 is_valid = TargetExtractor.is_valid_term(pos_tags[word_idx])
                 if len(token) > 1 and is_noun and is_valid:
-                    nouns.append(TargetExtractor.singular(token))
+                    nouns.append(token)
                 word_idx += 1
 
         return Counter(nouns)
@@ -279,42 +287,7 @@ class TargetExtractor:
 
         return root
 
-    # product has to be at idx 0
-    # targets have to be sorted in descending order based on counts (excluding product)
-    @staticmethod
-    def get_product_tree(dep_matrix, targets):
-        remaining_targets = [idx for idx in range(len(targets))]
-        root = Node(targets[remaining_targets.pop(0)])
-        n_null = 3
-        dependencies = [None] * n_null + [TargetExtractor.get_significant_dependence(
-            idx, dep_matrix, ignore_idx=[0]+list(range(idx+1, len(targets)))) for idx in range(n_null, len(targets))]
-        print(dependencies)
-        while remaining_targets:
-            idx = remaining_targets.pop(0)
-            t_node = Node(targets[idx], parent=root)
-            t_node.idx = idx
-            dependants = [(d_idx, dep[1]) for d_idx, dep in enumerate(dependencies) if dep and dep[0] == idx]
-            print(t_node, [targets[i] for i, _ in dependants])
-            for d_idx, _ in sorted(dependants, key=lambda p: p[1], reverse=True):
-                if d_idx not in remaining_targets:
-                    continue
-                # parent = root
-                # if not t_node.children or any(TargetExtractor.are_correlated(d_idx, c.idx, dep_matrix, ignore_idx=[0, idx]) for c in t_node.children):
-                #     parent = t_node
-                d_node = Node(targets[d_idx], parent=t_node)
-                d_node.idx = d_idx
-                remaining_targets.remove(d_idx)
-
-        # for idx, t in enumerate(targets):
-        #     if idx == 0:
-        #         continue
-        #     dep_idx =
-        #     parent = next((d for d in root.descendants if d.idx == dep_idx), root)
-        #     node = Node(t, parent=parent)
-        #     node.idx = idx
-        return root
-
-    def get_product_tree2(self):
+    def get_product_tree(self):
         root = Node(self.aspects[0])
         for idx in range(1, TargetExtractor.N_DIRECT_FEATURES + 1):
             node = Node(self.aspects[idx], parent=root)
@@ -500,16 +473,16 @@ class Synset:
         return None
 
 
-# electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
-# electronics_extractor = TargetExtractor('device', electronics_texts)
+# parent_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
 # texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
-# extractor = TargetExtractor('laptop', texts, parent=electronics_extractor)
+# extractor = TargetExtractor(product='laptop', texts=texts, parent_texts=parent_texts)
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # extractor.save()
 
-extractor: TargetExtractor = TargetExtractor.load_saved('camera')
+extractor: TargetExtractor = TargetExtractor.load_saved('laptop')
 extractor.save_product_representation()
+
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # print(extractor.syn_dict)
diff --git a/ADA/server/agent/train_classifier.py b/ADA/server/agent/train_classifier.py
index a31912b..0297816 100644
--- a/ADA/server/agent/train_classifier.py
+++ b/ADA/server/agent/train_classifier.py
@@ -4,7 +4,7 @@ import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 selected_reviews_output_location = 'reviews_trained_on.tsv'
 classifier_location = 'camera_review_classifier.pickle'
 min_characters = 0
diff --git a/ADA/server/ios_server/views.py b/ADA/server/ios_server/views.py
index b3506fb..51f0441 100644
--- a/ADA/server/ios_server/views.py
+++ b/ADA/server/ios_server/views.py
@@ -18,7 +18,6 @@ def product(request):
     product_type = request.GET.get('type', '')
 
     if not communicators:
-        print(1)
         communicators.append(Communicator(dl))
     communicator = communicators[0]
 
-- 
GitLab
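
Note for reviewers: the core behavioural change here is review.py moving from re.finditer over the raw phrase string to exact matching over lemmatized token lists. Below is a minimal standalone sketch of how the new matching behaves, assuming only nltk with the punkt and wordnet data installed; the sentence and the glossary term are invented for illustration, not taken from the extracted data.

# Token-level argument matching as introduced in review.py (sketch).
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def matching_subsequences(l_sub, l):
    # (start, end) index pairs of every occurrence of the token list
    # l_sub inside the token list l, as in Phrase.matching_subsequences.
    sub_idxs = []
    len_sub = len(l_sub)
    for i in range(len(l)):
        if l[i:i+len_sub] == l_sub:
            sub_idxs.append((i, i+len_sub))
    return sub_idxs

text = 'The battery life is great, and the batteries charge fast.'
tokens = [wnl.lemmatize(w.lower()) for w in word_tokenize(text)]

# A synonym such as 'battery_life' becomes ['battery', 'life'] via the
# syn.split('_') added in product.py, and lemmatization maps 'batteries'
# to 'battery', so both surface forms are found:
print(matching_subsequences(['battery', 'life'], tokens))  # [(1, 3)]
print(matching_subsequences(['battery'], tokens))          # [(1, 2), (8, 9)]

Matching whole tokens should also remove the substring false positives that re.finditer allowed, e.g. a glossary term like 'port' matching inside 'portability' (hypothetical example).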
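Likewise, a sketch of the reworked TargetExtractor construction: instead of receiving a fully built parent extractor, the child now trains the n-gram Phraser once on its own corpus and hands both the phraser and the raw parent texts down, so parent and child segment terms identically. The corpus and the threshold below are placeholders, not the patch's real constants or data.

# Shared-phraser pattern from the new TargetExtractor.__init__ (sketch).
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import sent_tokenize, word_tokenize

PHRASE_THRESHOLD = 4  # placeholder value

def tokenize(texts):
    # mirror the tokenization in __init__: lowercased words per sentence
    return [[w.lower() for w in word_tokenize(s)]
            for text in texts for s in sent_tokenize(text)]

child_phrases = tokenize(['Battery life is great.', 'I love the battery life.'])
parent_phrases = tokenize(['The battery life of this camera is poor.'])

# train bigrams, then trigrams on the bigrammed corpus, as in the patch
bigram = Phrases(child_phrases, threshold=PHRASE_THRESHOLD)
trigram = Phrases(bigram[child_phrases], threshold=PHRASE_THRESHOLD)
phraser = Phraser(trigram)

# the same phraser is then applied to the parent corpus, so any collocation
# that scored above the threshold (e.g. 'battery_life') is one token in both
print([phraser[p] for p in parent_phrases])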
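Finally, the prep_metadata.py change replaces a whole-file pd.read_json with chunked filtering, which keeps memory bounded on the large metadata dump. The same pattern in isolation; the path and category are hypothetical.

# Chunked filter-and-concat pattern from prep_metadata.py (sketch).
import pandas as pd

category = 'Backpacks'
# lines=True reads line-delimited JSON; chunksize yields an iterator of
# DataFrames, so the full file is never resident in memory at once
chunk_iter = pd.read_json('metadata.json', lines=True, chunksize=1000)
metadata = pd.concat(chunk[chunk['category'].apply(lambda cats: category in cats)]
                     for chunk in chunk_iter)
print(len(metadata.index))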