Commit 8c3b320d authored by Joel Oksanen

Implemented word2vec for detecting synonyms

parent 7d9fd221
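In outline, this commit (a) extends the dataset script to split the Electronics review dump into a parent set and a child-product set, and (b) replaces the WordNet/ConceptNet synonym lookup in TargetExtractor with a gensim Word2Vec model trained on the tokenized review phrases: aspect pairs whose embedding similarity exceeds SYNONYM_SIMILARITY are treated as synonyms. A minimal standalone sketch of that synonym-detection idea, assuming the gensim 3.x API used in the diff and a purely illustrative toy corpus:

from gensim.models import Word2Vec

# Toy corpus: one tokenized, lowercased review per inner list, repeated
# so every token clears min_count=5 (mirroring get_word2vec_model below).
phrases = [
    ['great', 'picture', 'quality'],
    ['the', 'image', 'quality', 'is', 'great'],
    ['battery', 'life', 'is', 'poor'],
] * 100

wv = Word2Vec(phrases, min_count=5).wv  # same call as in the commit

SYNONYM_SIMILARITY = 0.7  # threshold taken from the commit
terms = ['picture', 'image', 'battery']
syn_pairs = {frozenset((t1, t2)) for t1 in terms for t2 in terms
             if t1 != t2 and wv.similarity(t1, t2) > SYNONYM_SIMILARITY}
print(syn_pairs)

On a corpus this small the similarities are essentially noise; on real review data one would expect the pair ('picture', 'image') to clear the threshold while pairs involving 'battery' do not.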
@@ -2,7 +2,7 @@ import pandas as pd
 import gzip
 import json
-MAX_ITEMS = 150000
+MAX_ITEMS = 200000
 def parse(path):
@@ -17,19 +17,30 @@ def get_df(path):
     for d in parse(path):
         df[i] = d
         i += 1
-        if i == 1000000:
+        if i == MAX_ITEMS:
             break
     return pd.DataFrame.from_dict(df, orient='index')
 metadata = get_df('amazon_data/Electronics.json.gz')
 output_location = 'target_extraction/data/electronics_reviews.tsv'
+child_product = 'speaker'
+reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
+                      compression='gzip')
+parent_output = 'target_extraction/data/electronics_reviews.tsv'
+child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
-for col in metadata.columns:
+for col in reviews.columns:
     print(col)
 metadata = metadata.sample(frac=1).reset_index(drop=True)
 metadata = metadata.head(MAX_ITEMS)
+c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
+p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
+c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
+c_reviews = c_reviews.head(MAX_ITEMS)
+p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
+p_reviews = p_reviews.head(MAX_ITEMS)
+p_reviews.to_csv(parent_output, sep='\t', index=False)
+c_reviews.to_csv(child_output, sep='\t', index=False)
+print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
 # # get metadata for sunglasses
 # metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
@@ -55,5 +66,4 @@ metadata = metadata.head(MAX_ITEMS)
 #     print('tech1:', row['tech1'])
 #     print('tech2:', row['tech2'])
 metadata.to_csv(output_location, sep='\t', index=False)
-print('Successfully prepared data for', len(metadata.index), 'reviews')
@@ -12,6 +12,7 @@ from anytree import Node, RenderTree
 import itertools
 import numpy as np
 import re
+from gensim.models import Word2Vec
 stop_words = stopwords.words('english')
 wnl = WordNetLemmatizer()
@@ -29,7 +30,11 @@ class TargetExtractor:
     N_ASPECTS = 30
     MIN_DIRECT_GAIN = 0.1
     DEPTH_COST = 0.3
-    FREQ_OVER_PARENT = 2  # target must appear x times more frequently than in parent
+    FREQ_OVER_PARENT = 3  # target must appear x times more frequently than in parent
+    # word2vec
+    MIN_SIMILARITY = 0
+    SYNONYM_SIMILARITY = 0.7
     # parent is a TargetExtractor of a parent category, e.g. electronics > camera
     def __init__(self, product, texts, parent=None):
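To make the new constants concrete: with FREQ_OVER_PARENT raised from 2 to 3, a term making up, say, 0.6% of noun occurrences in the child corpus but 0.1% in the parent corpus passes (0.6 / 0.1 = 6 > 3), while a generic term at 0.2% vs 0.1% (ratio 2) would previously have passed and is now rejected. MIN_SIMILARITY = 0 keeps the new word2vec gate permissive (only terms with negative similarity to the product word are dropped), whereas SYNONYM_SIMILARITY = 0.7 demands strong similarity before two aspects are merged as synonyms. The percentages here are illustrative; the ratio itself comes from frequency_for_term further down the diff.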
@@ -37,7 +42,7 @@
         self.parent = parent
         # tokenize and normalize phrases
-        self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
+        self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
                         for phrase in texts]
         # train bigram map
@@ -47,16 +52,20 @@
         # count terms
         self.counter = self.count_nouns()
         self.total_count = sum(self.counter.values())
+        print(parent, self.total_count)
     def get_tree_and_synonyms(self):
+        # train word2vec model
+        wv = self.get_word2vec_model()
         # mine aspects
-        aspects, counts = self.get_related_nouns(self.counter)
+        aspects, counts = self.get_related_nouns(self.counter, wv)
         print(aspects)
         # obtain synonyms
-        synset = Synset(aspects)
+        syn_pairs = self.get_syn_pairs(aspects, wv)
+        print(syn_pairs)
+        synset = Synset(aspects, syn_pairs)
         syn_dict = synset.get_dict(counts)
         # remove aspect synonyms
@@ -110,27 +119,25 @@
             if '_' in token:
                 words = token.split('_')
                 if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
-                    nouns.append(token)
+                    nouns.append(TargetExtractor.singular(token))
                 word_idx += len(words)
             else:
                 if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
-                    nouns.append(token)
+                    nouns.append(TargetExtractor.singular(token))
                 word_idx += 1
         return Counter(nouns)
-    def get_related_nouns(self, counter):
+    def get_related_nouns(self, counter, wv):
         common = counter.most_common()
         term_counts = []
         while len(term_counts) < TargetExtractor.N_ASPECTS:
             term, count = common.pop(0)
-            print(term)
+            print(term, count)
             # filter terms not related to the product
             # cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
-            if (not self.parent or self.parent.frequency_for_term(term) == 0 or
-                    self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
-                    TargetExtractor.FREQ_OVER_PARENT):
+            if self.is_related_to_product(term, wv):
                 term_counts.append((term, count))
         terms = [term for term, count in term_counts]
@@ -143,9 +150,23 @@
         return terms, {term: count for term, count in term_counts}
+    def is_related_to_product(self, term, wv):
+        return (term in wv.vocab and wv.similarity(self.product, term) > TargetExtractor.MIN_SIMILARITY and
+                (not self.parent or self.parent.frequency_for_term(term) == 0 or
+                 self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
+                 TargetExtractor.FREQ_OVER_PARENT))
+    @staticmethod
+    def get_syn_pairs(terms, model):
+        return {frozenset((t1, t2)) for t1 in terms for t2 in terms
+                if t1 != t2 and model.similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
     def frequency_for_term(self, term):
         return self.counter[term] / self.total_count
+    def get_word2vec_model(self):
+        return Word2Vec(self.phrases, min_count=5).wv
     @staticmethod
     def wordnet_relatedness(t1, t2):
         fst = wordnet.synset(t1 + '.n.01')
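A version note on the methods added above: the expressions term in wv.vocab and Word2Vec(self.phrases, min_count=5) follow the gensim 3.x API. gensim 4.0 removed KeyedVectors.vocab, so a hypothetical port (not part of this commit) would write the membership test as:

term in wv.key_to_index  # gensim >= 4.0; plain `term in wv` also works

wv.similarity(...) and the Word2Vec(sentences, min_count=5) constructor arguments used here are unchanged across both versions.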
@@ -199,18 +220,9 @@ class TargetExtractor:
 class Synset:
-    def __init__(self, aspects):
+    def __init__(self, aspects, syn_pairs):
         self.vocab = aspects
-        self.syn_pairs = {frozenset((aspect, syn)) for aspect in aspects
-                          for syn in self.get_syns(aspect) if aspect != syn}
-    def get_syns(self, word):
-        syns = set()
-        for syn in wordnet.synsets(word, pos=wordnet.NOUN):
-            for lemma in syn.lemmas():
-                syns.add(lemma.name())
-        syns = {syn for syn in syns if syn in self.vocab and cnet.get_relatedness(syn, word) > 0.5}
-        return syns
+        self.syn_pairs = syn_pairs
     def get_dict(self, counts):
         groups = self.get_groups()
@@ -231,6 +243,8 @@ class Synset:
     def join_groups(w1, w2, groups):
         g1 = Synset.group_for(w1, groups)
         g2 = Synset.group_for(w2, groups)
+        if g1 and g2 and g1 == g2:
+            return True
         if g1:
             groups.remove(g1)
         if g2:
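The two lines added above make join_groups return early when both words already sit in the same group, avoiding removing that group twice. For intuition, the net effect of folding syn_pairs into groups looks like this (an illustrative re-implementation, not the repository's Synset code):

# Merge overlapping synonym pairs into groups, in the spirit of join_groups.
def group_pairs(syn_pairs):
    groups = []
    for pair in syn_pairs:
        merged = set(pair)
        rest = []
        for g in groups:
            if g & merged:
                merged |= g      # word shared with an existing group: fold it in
            else:
                rest.append(g)   # disjoint group: keep as-is
        groups = rest + [merged]
    return groups

pairs = {frozenset(('picture', 'image')), frozenset(('image', 'photo'))}
print(group_pairs(pairs))  # [{'picture', 'image', 'photo'}]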
@@ -260,13 +274,11 @@
         return None
-electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'reviewText')
-print(1)
-electronics_extractor = TargetExtractor('device', electronics_texts)
-print(2)
-camera_texts = obtain_texts('data/camera_metadata.tsv', 'feature')
-print(3)
-camera_extractor = TargetExtractor('camera', camera_texts, parent=electronics_extractor)
-tree, synonyms = camera_extractor.get_tree_and_synonyms()
+laptop_texts = obtain_texts('data/laptop_reviews.tsv', 'review_body')
+laptop_extractor = TargetExtractor('laptop', laptop_texts)
+camera_texts = obtain_texts('data/camera_prepared_data.tsv', 'review_body')
+camera_extractor = TargetExtractor('camera', camera_texts, parent=laptop_extractor)
+tree, syns = camera_extractor.get_tree_and_synonyms()
 print(RenderTree(tree))
-print(synonyms)
+print(syns)