Commit 8c3b320d authored by Joel Oksanen

Implemented word2vec for detecting synonyms

parent 7d9fd221
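In outline, this commit (a) extends the dataset script to split the Electronics review dump into a parent set and a child-product set, and (b) replaces the WordNet/ConceptNet synonym lookup in TargetExtractor with a gensim Word2Vec model trained on the tokenized review phrases: aspect pairs whose embedding similarity exceeds SYNONYM_SIMILARITY are treated as synonyms. A minimal standalone sketch of that synonym-detection idea, assuming the gensim 3.x API used in the diff and a purely illustrative toy corpus:

from gensim.models import Word2Vec

# Toy corpus: one tokenized, lowercased review per inner list, repeated
# so every token clears min_count=5 (mirroring get_word2vec_model below).
phrases = [
    ['great', 'picture', 'quality'],
    ['the', 'image', 'quality', 'is', 'great'],
    ['battery', 'life', 'is', 'poor'],
] * 100

wv = Word2Vec(phrases, min_count=5).wv  # same call as in the commit

SYNONYM_SIMILARITY = 0.7  # threshold taken from the commit
terms = ['picture', 'image', 'battery']
syn_pairs = {frozenset((t1, t2)) for t1 in terms for t2 in terms
             if t1 != t2 and wv.similarity(t1, t2) > SYNONYM_SIMILARITY}
print(syn_pairs)

On a corpus this small the similarities are essentially noise; on real review data one would expect the pair ('picture', 'image') to clear the threshold while pairs involving 'battery' do not.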
@@ -2,7 +2,7 @@ import pandas as pd
 import gzip
 import json
-MAX_ITEMS = 150000
+MAX_ITEMS = 200000
 def parse(path):
@@ -17,19 +17,30 @@ def get_df(path):
     for d in parse(path):
         df[i] = d
         i += 1
-        if i == 1000000:
+        if i == MAX_ITEMS:
             break
     return pd.DataFrame.from_dict(df, orient='index')
 metadata = get_df('amazon_data/Electronics.json.gz')
 output_location = 'target_extraction/data/electronics_reviews.tsv'
+child_product = 'speaker'
+reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
+                      compression='gzip')
+parent_output = 'target_extraction/data/electronics_reviews.tsv'
+child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
-for col in metadata.columns:
+for col in reviews.columns:
     print(col)
 metadata = metadata.sample(frac=1).reset_index(drop=True)
 metadata = metadata.head(MAX_ITEMS)
+c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
+p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
+c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
+c_reviews = c_reviews.head(MAX_ITEMS)
+p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
+p_reviews = p_reviews.head(MAX_ITEMS)
+p_reviews.to_csv(parent_output, sep='\t', index=False)
+c_reviews.to_csv(child_output, sep='\t', index=False)
+print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
 # # get metadata for sunglasses
 # metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
@@ -55,5 +66,4 @@ metadata = metadata.head(MAX_ITEMS)
 #     print('tech1:', row['tech1'])
 #     print('tech2:', row['tech2'])
 metadata.to_csv(output_location, sep='\t', index=False)
-print('Successfully prepared data for', len(metadata.index), 'reviews')
@@ -12,6 +12,7 @@ from anytree import Node, RenderTree
 import itertools
 import numpy as np
 import re
+from gensim.models import Word2Vec
 stop_words = stopwords.words('english')
 wnl = WordNetLemmatizer()
@@ -29,7 +30,11 @@ class TargetExtractor:
     N_ASPECTS = 30
     MIN_DIRECT_GAIN = 0.1
     DEPTH_COST = 0.3
-    FREQ_OVER_PARENT = 2  # target must appear x times more frequently than in parent
+    FREQ_OVER_PARENT = 3  # target must appear x times more frequently than in parent
+    # word2vec
+    MIN_SIMILARITY = 0
+    SYNONYM_SIMILARITY = 0.7
     # parent is a TargetExtractor of a parent category, e.g. electronics > camera
     def __init__(self, product, texts, parent=None):
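To make the new constants concrete: with FREQ_OVER_PARENT raised from 2 to 3, a term making up, say, 0.6% of noun occurrences in the child corpus but 0.1% in the parent corpus passes (0.6 / 0.1 = 6 > 3), while a generic term at 0.2% vs 0.1% (ratio 2) would previously have passed and is now rejected. MIN_SIMILARITY = 0 keeps the new word2vec gate permissive (only terms with negative similarity to the product word are dropped), whereas SYNONYM_SIMILARITY = 0.7 demands strong similarity before two aspects are merged as synonyms. The percentages here are illustrative; the ratio itself comes from frequency_for_term further down the diff.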
@@ -37,7 +42,7 @@
         self.parent = parent
         # tokenize and normalize phrases
-        self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
+        self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
                         for phrase in texts]
         # train bigram map
@@ -47,16 +52,20 @@
         # count terms
         self.counter = self.count_nouns()
         self.total_count = sum(self.counter.values())
+        print(parent, self.total_count)
     def get_tree_and_synonyms(self):
+        # train word2vec model
+        wv = self.get_word2vec_model()
         # mine aspects
-        aspects, counts = self.get_related_nouns(self.counter)
+        aspects, counts = self.get_related_nouns(self.counter, wv)
         print(aspects)
         # obtain synonyms
-        synset = Synset(aspects)
+        syn_pairs = self.get_syn_pairs(aspects, wv)
+        print(syn_pairs)
+        synset = Synset(aspects, syn_pairs)
         syn_dict = synset.get_dict(counts)
         # remove aspect synonyms
@@ -110,27 +119,25 @@
             if '_' in token:
                 words = token.split('_')
                 if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
-                    nouns.append(token)
+                    nouns.append(TargetExtractor.singular(token))
                 word_idx += len(words)
             else:
                 if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
-                    nouns.append(token)
+                    nouns.append(TargetExtractor.singular(token))
                 word_idx += 1
         return Counter(nouns)
-    def get_related_nouns(self, counter):
+    def get_related_nouns(self, counter, wv):
         common = counter.most_common()
         term_counts = []
         while len(term_counts) < TargetExtractor.N_ASPECTS:
             term, count = common.pop(0)
-            print(term)
+            print(term, count)
             # filter terms not related to the product
             # cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
-            if (not self.parent or self.parent.frequency_for_term(term) == 0 or
-                    self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
-                    TargetExtractor.FREQ_OVER_PARENT):
+            if self.is_related_to_product(term, wv):
                 term_counts.append((term, count))
         terms = [term for term, count in term_counts]
@@ -143,9 +150,23 @@
         return terms, {term: count for term, count in term_counts}
+    def is_related_to_product(self, term, wv):
+        return (term in wv.vocab and wv.similarity(self.product, term) > TargetExtractor.MIN_SIMILARITY and
+                (not self.parent or self.parent.frequency_for_term(term) == 0 or
+                 self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
+                 TargetExtractor.FREQ_OVER_PARENT))
+    @staticmethod
+    def get_syn_pairs(terms, model):
+        return {frozenset((t1, t2)) for t1 in terms for t2 in terms
+                if t1 != t2 and model.similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
     def frequency_for_term(self, term):
         return self.counter[term] / self.total_count
+    def get_word2vec_model(self):
+        return Word2Vec(self.phrases, min_count=5).wv
     @staticmethod
     def wordnet_relatedness(t1, t2):
         fst = wordnet.synset(t1 + '.n.01')
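A version note on the methods added above: the expressions term in wv.vocab and Word2Vec(self.phrases, min_count=5) follow the gensim 3.x API. gensim 4.0 removed KeyedVectors.vocab, so a hypothetical port (not part of this commit) would write the membership test as:

term in wv.key_to_index  # gensim >= 4.0; plain `term in wv` also works

wv.similarity(...) and the Word2Vec(sentences, min_count=5) constructor arguments used here are unchanged across both versions.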
@@ -199,18 +220,9 @@ class TargetExtractor:
 class Synset:
-    def __init__(self, aspects):
+    def __init__(self, aspects, syn_pairs):
         self.vocab = aspects
-        self.syn_pairs = {frozenset((aspect, syn)) for aspect in aspects
-                          for syn in self.get_syns(aspect) if aspect != syn}
-    def get_syns(self, word):
-        syns = set()
-        for syn in wordnet.synsets(word, pos=wordnet.NOUN):
-            for lemma in syn.lemmas():
-                syns.add(lemma.name())
-        syns = {syn for syn in syns if syn in self.vocab and cnet.get_relatedness(syn, word) > 0.5}
-        return syns
+        self.syn_pairs = syn_pairs
     def get_dict(self, counts):
         groups = self.get_groups()
@@ -231,6 +243,8 @@ class Synset:
     def join_groups(w1, w2, groups):
         g1 = Synset.group_for(w1, groups)
         g2 = Synset.group_for(w2, groups)
+        if g1 and g2 and g1 == g2:
+            return True
         if g1:
             groups.remove(g1)
         if g2:
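The two lines added above make join_groups return early when both words already sit in the same group, avoiding removing that group twice. For intuition, the net effect of folding syn_pairs into groups looks like this (an illustrative re-implementation, not the repository's Synset code):

# Merge overlapping synonym pairs into groups, in the spirit of join_groups.
def group_pairs(syn_pairs):
    groups = []
    for pair in syn_pairs:
        merged = set(pair)
        rest = []
        for g in groups:
            if g & merged:
                merged |= g      # word shared with an existing group: fold it in
            else:
                rest.append(g)   # disjoint group: keep as-is
        groups = rest + [merged]
    return groups

pairs = {frozenset(('picture', 'image')), frozenset(('image', 'photo'))}
print(group_pairs(pairs))  # [{'picture', 'image', 'photo'}]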
@@ -260,13 +274,11 @@
         return None
-electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'reviewText')
-print(1)
-electronics_extractor = TargetExtractor('device', electronics_texts)
-print(2)
-camera_texts = obtain_texts('data/camera_metadata.tsv', 'feature')
-print(3)
-camera_extractor = TargetExtractor('camera', camera_texts, parent=electronics_extractor)
-tree, synonyms = camera_extractor.get_tree_and_synonyms()
+laptop_texts = obtain_texts('data/laptop_reviews.tsv', 'review_body')
+laptop_extractor = TargetExtractor('laptop', laptop_texts)
+camera_texts = obtain_texts('data/camera_prepared_data.tsv', 'review_body')
+camera_extractor = TargetExtractor('camera', camera_texts, parent=laptop_extractor)
+tree, syns = camera_extractor.get_tree_and_synonyms()
 print(RenderTree(tree))
-print(synonyms)
+print(syns)