From ce3937014fef529af7ef9bf0d383abcf5717987e Mon Sep 17 00:00:00 2001
From: Joel Oksanen <jjo2317@ic.ac.uk>
Date: Sat, 9 May 2020 16:51:47 +0300
Subject: [PATCH] Improved integration between target_extractor and server

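TargetExtractor no longer receives a prebuilt parent extractor.
Instead it takes the parent category's review texts and builds the
parent extractor itself, reusing its own trained phraser. Phrase
matching in the server agent is now token-based: review text is
lowercased and lemmatized, and glossary synonyms are stored as token
lists, so inflected forms of aspect terms are matched as well.

Further changes:

- agent/communicator.py: add a response for query 3, which explains
  a negative assessment by its strongest attacking subfeature and
  concedes the strongest supporting one when it exists.
- agent/prep_metadata.py: read metadata in chunks so the full JSON
  file is never held in memory at once.
- Amazon review data files are referenced by their renamed paths
  (e.g. amazon_reviews_us_pc.tsv).

A rough sketch of the updated TargetExtractor usage (input paths are
illustrative, mirroring the commented example in target_extractor.py):

    parent_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')
    texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
    extractor = TargetExtractor(product='laptop', texts=texts,
                                parent_texts=parent_texts)
    tree, syns = extractor.get_tree_and_synonyms()

Token matching example (hypothetical tokens):

    >>> Phrase.matching_subsequences(['battery', 'life'],
    ...                              ['the', 'battery', 'life', 'is', 'good'])
    [(1, 3)]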
---
 ADA/server/agent/agent.py                     | 13 ++--
 ADA/server/agent/communicator.py              | 15 ++++
 ADA/server/agent/dataloader.py                |  3 +-
 ADA/server/agent/prep_data.py                 |  2 +-
 ADA/server/agent/prep_metadata.py             |  8 ++-
 ADA/server/agent/product_finder.py            |  2 +-
 ADA/server/agent/review.py                    | 31 +++++--
 ADA/server/agent/review_annotation.py         |  2 +-
 ADA/server/agent/target_extraction/product.py |  7 +-
 .../target_extraction/target_extractor.py     | 81 ++++++-------
 ADA/server/agent/train_classifier.py          |  2 +-
 ADA/server/ios_server/views.py                |  1 -
 12 files changed, 86 insertions(+), 81 deletions(-)

diff --git a/ADA/server/agent/agent.py b/ADA/server/agent/agent.py
index 3be5567..f237644 100644
--- a/ADA/server/agent/agent.py
+++ b/ADA/server/agent/agent.py
@@ -140,16 +140,17 @@ class Agent:
         return max(attacker_strengths, key=attacker_strengths.get)
 
     def liked_argument(self, argument):
-        return self.vote_sum[
-                   argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+        return self.vote_sum[argument] >= 0
+        # self.strengths[argument] > 0.5
+        # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
 
     def supported_argument(self, argument):
-        return (self.get_strongest_supporting_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_supporting_subfeature(argument)] > 0)
+        supp = self.get_strongest_supporting_subfeature(argument)
+        return supp is not None and self.strengths[supp] > 0
 
     def attacked_argument(self, argument):
-        return (self.get_strongest_attacking_subfeature(argument) is not None and
-                self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
+        att = self.get_strongest_attacking_subfeature(argument)
+        return att is not None and self.strengths[att] > 0
 
     def best_supporting_phrase(self, argument):
         phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
diff --git a/ADA/server/agent/communicator.py b/ADA/server/agent/communicator.py
index fb86eb3..b54939c 100644
--- a/ADA/server/agent/communicator.py
+++ b/ADA/server/agent/communicator.py
@@ -88,6 +88,21 @@ class Communicator:
                 text += '.'
                 args = [q_arg_node, supp_node]
 
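+        # query 3: explain a negative assessment by the strongest attacking subfeature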
+        if query_id == 3:
+            supp_node = self.agent.get_strongest_supporting_subfeature(q_arg_node)
+            att_node = self.agent.get_strongest_attacking_subfeature(q_arg_node)
+            att_name = self.product.argument_for_node(att_node).name
+            text = 'The {} was considered to be poor because the {} {} poor'.format(q_arg.name, att_name,
+                                                                                    self.was_were(att_name))
+            if supp_node:
+                supp_name = self.product.argument_for_node(supp_node).name
+                text += ', although the {} {} good.'.format(supp_name, self.was_were(supp_name))
+                args = [q_arg_node, att_node, supp_node]
+            else:
+                text += '.'
+                args = [q_arg_node, att_node]
+
         if query_id == 4 or query_id == 5:
             phrase = (self.agent.best_supporting_phrase(q_arg_node) if query_id == 4
                       else self.agent.best_attacking_phrase(q_arg_node))
diff --git a/ADA/server/agent/dataloader.py b/ADA/server/agent/dataloader.py
index 0a7cb3a..3908603 100644
--- a/ADA/server/agent/dataloader.py
+++ b/ADA/server/agent/dataloader.py
@@ -1,7 +1,8 @@
 import pandas as pd
 
+
 class DataLoader:
-    data_location = 'agent/amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+    data_location = 'agent/amazon_data/amazon_reviews_us_pc.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 
     def get_reviews(self, product_id):
diff --git a/ADA/server/agent/prep_data.py b/ADA/server/agent/prep_data.py
index f0c7ba8..db78a30 100644
--- a/ADA/server/agent/prep_data.py
+++ b/ADA/server/agent/prep_data.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-data_location = 'amazon_data/amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_camera.tsv'
 training_data_location = 'amazon_data/reviews_trained_on.tsv'
 output_location = 'amazon_data/camera_prepared_data.tsv'
 min_reviews = 50
diff --git a/ADA/server/agent/prep_metadata.py b/ADA/server/agent/prep_metadata.py
index 4dd7a48..58ceaba 100644
--- a/ADA/server/agent/prep_metadata.py
+++ b/ADA/server/agent/prep_metadata.py
@@ -25,13 +25,11 @@ def get_df(path):
 pd.set_option('display.max_colwidth', None)
 
 category = 'Backpacks'
-metadata = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True)
-for col in metadata.columns:
-    print(col)
 
-metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
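+# read the metadata in chunks so the full JSON file is never loaded into memory at once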
+metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
+metadata = pd.concat([chunk[chunk['category'].apply(lambda cats: category in cats)] for chunk in metadata_iter])
 
-print(metadata['category'][:5])
 print(len(metadata.index))
 
 review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
diff --git a/ADA/server/agent/product_finder.py b/ADA/server/agent/product_finder.py
index 447e895..fb5a0e6 100644
--- a/ADA/server/agent/product_finder.py
+++ b/ADA/server/agent/product_finder.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 training_data_location = 'reviews_trained_on.tsv'
 min_characters = 50
 reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
diff --git a/ADA/server/agent/review.py b/ADA/server/agent/review.py
index 3875165..a8158bd 100644
--- a/ADA/server/agent/review.py
+++ b/ADA/server/agent/review.py
@@ -1,7 +1,10 @@
 import re
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
 from agent.SA.bert_dataset import MAX_SEQ_LEN
 from anytree import PostOrderIter
+from nltk.stem import WordNetLemmatizer
+
+wnl = WordNetLemmatizer()
 
 
 class Review:
@@ -34,7 +37,7 @@ class Review:
         # normalize
         for arg in self.votes:
             self.votes[arg] = 1 if self.votes[arg] > 0 else -1
-        self.augment_votes()
+        # self.augment_votes()
         return self.votes
 
     # augment votes (Definition 4.3) obtained for a single critic
@@ -58,20 +61,22 @@ class Phrase:
     def __init__(self, text, product):
         self.product = product
         self.text = text
-        self.args = self.get_args(text)
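+        # lowercase and lemmatize tokens so glossary terms also match inflected forms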
+        self.tokens = [wnl.lemmatize(word.lower()) for word in word_tokenize(text)]
+        self.args = self.get_args()
         self.votes = {}
 
     # get argument(s) that match phrase
-    def get_args(self, phrase):
+    def get_args(self):
         argument_matches = []
         arguments = [node for node in PostOrderIter(self.product.root)]
         while len(arguments) > 0:
-            f = arguments.pop(0)
-            for word in self.product.glossary[f]:
-                matches = [Arg(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
+            arg = arguments.pop(0)
+            for term in self.product.glossary[arg]:
+                matches = [Arg(arg, start, end) for start, end in Phrase.matching_subsequences(term, self.tokens)]
                 if matches:
                     argument_matches += matches
-                    self.remove_ancestors(f, arguments)
+                    self.remove_ancestors(arg, arguments)
                     break
         return argument_matches
 
@@ -97,6 +101,16 @@ class Phrase:
                 self.votes[arg.node] = arg.sentiment
         return self.votes
 
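+    # all occurrences of token list sub within token list seq, as (start, end) slice indices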
+    @staticmethod
+    def matching_subsequences(sub, seq):
+        sub_idxs = []
+        len_sub = len(sub)
+        for i in range(len(seq) - len_sub + 1):
+            if seq[i:i + len_sub] == sub:
+                sub_idxs.append((i, i + len_sub))
+        return sub_idxs
+
 
 class Arg:
 
diff --git a/ADA/server/agent/review_annotation.py b/ADA/server/agent/review_annotation.py
index 8cd35be..06e424e 100644
--- a/ADA/server/agent/review_annotation.py
+++ b/ADA/server/agent/review_annotation.py
@@ -12,7 +12,7 @@ import readchar
 from sty import fg, bg, ef, rs
 from wcwidth import wcswidth
 
-data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
+data_location = 'amazon_data/amazon_reviews_us_pc.tsv'
 selected_reviews_location = 'pc_reviews_to_be_annotated.xml'
 min_characters = 0
 max_characters = 200
diff --git a/ADA/server/agent/target_extraction/product.py b/ADA/server/agent/target_extraction/product.py
index a73f2ca..0686f6c 100644
--- a/ADA/server/agent/target_extraction/product.py
+++ b/ADA/server/agent/target_extraction/product.py
@@ -12,9 +12,10 @@ class Product:
         self.root = root
         self.feature_nodes = [n for n in root.descendants]
         self.argument_nodes = [root] + self.feature_nodes
-        self.glossary = {a_node: syns for a, syns in syn_dict.items() for a_node in self.argument_nodes
-                         if a_node.name == a}
-        self.arguments = {a_node: Argument(a_idx, a_node.name) for a_idx, a_node in enumerate(self.argument_nodes)}
+        self.glossary = {a_node: [syn.split('_') for syn in syns]
+                         for a, syns in syn_dict.items() for a_node in self.argument_nodes if a_node.name == a}
+        self.arguments = {a_node: Argument(a_idx, a_node.name.replace('_', ' '))
+                          for a_idx, a_node in enumerate(self.argument_nodes)}
 
     def argument_node_for_id(self, id):
         return self.argument_nodes[id]
diff --git a/ADA/server/agent/target_extraction/target_extractor.py b/ADA/server/agent/target_extraction/target_extractor.py
index 0490e26..9cebd1e 100644
--- a/ADA/server/agent/target_extraction/target_extractor.py
+++ b/ADA/server/agent/target_extraction/target_extractor.py
@@ -50,20 +50,28 @@ class TargetExtractor:
     SYNONYM_SIMILARITY = 0.1
 
-    # parent is a TargetExtrator of a parent category, eg. > electronics > camera
-    def __init__(self, product, texts, parent=None):
+    # parent_texts are reviews of a parent category, e.g. electronics > camera
+    def __init__(self, product, texts, parent_texts=None, phraser=None):
         self.product = product
-        self.parent = parent
 
         print('tokenizing phrases...')
         # tokenize and normalize phrases
         self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
                         for text in texts for phrase in sent_tokenize(text)]
 
-        print('obtaining bigrams...')
-        # train bigram map
-        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
-        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
-        self.phraser = Phraser(trigram)
+        if phraser is None:
+            print('obtaining n-grams...')
+            # train bigram and trigram phrase models
+            bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
+            trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
+            self.phraser = Phraser(trigram)
+        else:
+            self.phraser = phraser
+
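+        # build an extractor over the parent category's reviews, reusing this extractor's phraser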
+        self.parent = None
+        if parent_texts is not None:
+            print('analysing parent reviews...')
+            self.parent = TargetExtractor(product=None, texts=parent_texts, phraser=self.phraser)
 
         print('counting terms...')
         # count terms
@@ -71,7 +77,7 @@ class TargetExtractor:
         self.total_count = sum(self.counter.values())
 
     def save_product_representation(self):
-        f = open(Product.FILE_DIR + self.product + Product.FILE_EXTENSION, 'wb')
+        f = open('extracted_products/' + self.product + Product.FILE_EXTENSION, 'wb')
         p = Product(self.tree, self.syn_dict)
         pickle.dump(p, f)
         f.close()
@@ -105,7 +111,7 @@ class TargetExtractor:
 
         print('extracting aspect tree...')
         # extract aspect tree
-        self.tree = self.get_product_tree2()
+        self.tree = self.get_product_tree()
 
         return self.tree, self.syn_dict
 
@@ -179,7 +185,9 @@ class TargetExtractor:
         nouns = []
         for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram)
+                       if self.product and bigram != self.product
+                       else bigram
                        for bigram in self.phraser[phrase]]
             word_idx = 0
             for token in bigrams:
@@ -189,13 +197,13 @@ class TargetExtractor:
                     has_noun = any(TargetExtractor.is_noun(pos_tags[i]) for i in word_range)
                     all_terms_valid = all(TargetExtractor.is_valid_term(pos_tags[i]) for i in word_range)
                     if has_noun and all_terms_valid:
-                        nouns.append(TargetExtractor.singular(token))
+                        nouns.append(token)
                     word_idx += len(words)
                 else:
                     is_noun = TargetExtractor.is_noun(pos_tags[word_idx])
                     is_valid = TargetExtractor.is_valid_term(pos_tags[word_idx])
                     if len(token) > 1 and is_noun and is_valid:
-                        nouns.append(TargetExtractor.singular(token))
+                        nouns.append(token)
                     word_idx += 1
 
         return Counter(nouns)
@@ -279,42 +287,7 @@ class TargetExtractor:
 
         return root
 
-    # product has to be at idx 0
-    # targets have to be sorted in descending order based on counts (excluding product)
-    @staticmethod
-    def get_product_tree(dep_matrix, targets):
-        remaining_targets = [idx for idx in range(len(targets))]
-        root = Node(targets[remaining_targets.pop(0)])
-        n_null = 3
-        dependencies = [None] * n_null + [TargetExtractor.get_significant_dependence(
-            idx, dep_matrix, ignore_idx=[0]+list(range(idx+1, len(targets)))) for idx in range(n_null, len(targets))]
-        print(dependencies)
-        while remaining_targets:
-            idx = remaining_targets.pop(0)
-            t_node = Node(targets[idx], parent=root)
-            t_node.idx = idx
-            dependants = [(d_idx, dep[1]) for d_idx, dep in enumerate(dependencies) if dep and dep[0] == idx]
-            print(t_node, [targets[i] for i, _ in dependants])
-            for d_idx, _ in sorted(dependants, key=lambda p: p[1], reverse=True):
-                if d_idx not in remaining_targets:
-                    continue
-                # parent = root
-                # if not t_node.children or any(TargetExtractor.are_correlated(d_idx, c.idx, dep_matrix, ignore_idx=[0, idx]) for c in t_node.children):
-                #     parent = t_node
-                d_node = Node(targets[d_idx], parent=t_node)
-                d_node.idx = d_idx
-                remaining_targets.remove(d_idx)
-
-        # for idx, t in enumerate(targets):
-        #     if idx == 0:
-        #         continue
-        #     dep_idx =
-        #     parent = next((d for d in root.descendants if d.idx == dep_idx), root)
-        #     node = Node(t, parent=parent)
-        #     node.idx = idx
-        return root
-
-    def get_product_tree2(self):
+    def get_product_tree(self):
         root = Node(self.aspects[0])
         for idx in range(1, TargetExtractor.N_DIRECT_FEATURES + 1):
             node = Node(self.aspects[idx], parent=root)
@@ -500,16 +473,16 @@ class Synset:
         return None
 
 
-# electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
-# electronics_extractor = TargetExtractor('device', electronics_texts)
+# parent_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
 # texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
-# extractor = TargetExtractor('laptop', texts, parent=electronics_extractor)
+# extractor = TargetExtractor(product='laptop', texts=texts, parent_texts=parent_texts)
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # extractor.save()
 
-extractor: TargetExtractor = TargetExtractor.load_saved('camera')
+extractor: TargetExtractor = TargetExtractor.load_saved('laptop')
 extractor.save_product_representation()
+
 # tree, syns = extractor.get_tree_and_synonyms()
 # print(RenderTree(tree))
 # print(extractor.syn_dict)
diff --git a/ADA/server/agent/train_classifier.py b/ADA/server/agent/train_classifier.py
index a31912b..0297816 100644
--- a/ADA/server/agent/train_classifier.py
+++ b/ADA/server/agent/train_classifier.py
@@ -4,7 +4,7 @@ import random
 from nltk import classify, NaiveBayesClassifier
 import pickle
 
-data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
+data_location = 'amazon_reviews_us_camera.tsv'
 selected_reviews_output_location = 'reviews_trained_on.tsv'
 classifier_location = 'camera_review_classifier.pickle'
 min_characters = 0
diff --git a/ADA/server/ios_server/views.py b/ADA/server/ios_server/views.py
index b3506fb..51f0441 100644
--- a/ADA/server/ios_server/views.py
+++ b/ADA/server/ios_server/views.py
@@ -18,7 +18,6 @@ def product(request):
     product_type = request.GET.get('type', '')
 
     if not communicators:
-        print(1)
         communicators.append(Communicator(dl))
     communicator = communicators[0]
 
-- 
GitLab