Skip to content
Snippets Groups Projects
Commit 8c3b320d authored by Joel Oksanen's avatar Joel Oksanen
Browse files

Implemented word2vec for detecting synonyms

parent 7d9fd221
No related branches found
No related tags found
No related merge requests found
......@@ -2,7 +2,7 @@ import pandas as pd
import gzip
import json
MAX_ITEMS = 150000
MAX_ITEMS = 200000
def parse(path):
......@@ -17,19 +17,30 @@ def get_df(path):
for d in parse(path):
df[i] = d
i += 1
if i == 1000000:
if i == MAX_ITEMS:
break
return pd.DataFrame.from_dict(df, orient='index')
metadata = get_df('amazon_data/Electronics.json.gz')
output_location = 'target_extraction/data/electronics_reviews.tsv'
child_product = 'speaker'
reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
compression='gzip')
parent_output = 'target_extraction/data/electronics_reviews.tsv'
child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
for col in metadata.columns:
for col in reviews.columns:
print(col)
metadata = metadata.sample(frac=1).reset_index(drop=True)
metadata = metadata.head(MAX_ITEMS)
c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
c_reviews = c_reviews.head(MAX_ITEMS)
p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
p_reviews = p_reviews.head(MAX_ITEMS)
p_reviews.to_csv(parent_output, sep='\t', index=False)
c_reviews.to_csv(child_output, sep='\t', index=False)
print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
......@@ -55,5 +66,4 @@ metadata = metadata.head(MAX_ITEMS)
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
metadata.to_csv(output_location, sep='\t', index=False)
print('Successfully prepared data for', len(metadata.index), 'reviews')
......@@ -12,6 +12,7 @@ from anytree import Node, RenderTree
import itertools
import numpy as np
import re
from gensim.models import Word2Vec
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
......@@ -29,7 +30,11 @@ class TargetExtractor:
N_ASPECTS = 30
MIN_DIRECT_GAIN = 0.1
DEPTH_COST = 0.3
FREQ_OVER_PARENT = 2 # target must appear x times more frequently than in parent
FREQ_OVER_PARENT = 3 # target must appear x times more frequently than in parent
# word2vec
MIN_SIMILARITY = 0
SYNONYM_SIMILARITY = 0.7
# parent is a TargetExtractor of a parent category, e.g. electronics > camera
def __init__(self, product, texts, parent=None):
......@@ -37,7 +42,7 @@ class TargetExtractor:
self.parent = parent
# tokenize and normalize phrases
self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in texts]
# train bigram map
......@@ -47,16 +52,20 @@ class TargetExtractor:
# count terms
self.counter = self.count_nouns()
self.total_count = sum(self.counter.values())
print(parent, self.total_count)
def get_tree_and_synonyms(self):
# train word2vec model
wv = self.get_word2vec_model()
# mine aspects
aspects, counts = self.get_related_nouns(self.counter)
aspects, counts = self.get_related_nouns(self.counter, wv)
print(aspects)
# obtain synonyms
synset = Synset(aspects)
syn_pairs = self.get_syn_pairs(aspects, wv)
print(syn_pairs)
synset = Synset(aspects, syn_pairs)
syn_dict = synset.get_dict(counts)
# remove aspect synonyms
......@@ -110,27 +119,25 @@ class TargetExtractor:
if '_' in token:
words = token.split('_')
if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
nouns.append(token)
nouns.append(TargetExtractor.singular(token))
word_idx += len(words)
else:
if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
nouns.append(token)
nouns.append(TargetExtractor.singular(token))
word_idx += 1
return Counter(nouns)
def get_related_nouns(self, counter):
def get_related_nouns(self, counter, wv):
common = counter.most_common()
term_counts = []
while len(term_counts) < TargetExtractor.N_ASPECTS:
term, count = common.pop(0)
print(term)
print(term, count)
# filter terms not related to the product
# cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
if (not self.parent or self.parent.frequency_for_term(term) == 0 or
self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
TargetExtractor.FREQ_OVER_PARENT):
if self.is_related_to_product(term, wv):
term_counts.append((term, count))
terms = [term for term, count in term_counts]
......@@ -143,9 +150,23 @@ class TargetExtractor:
return terms, {term: count for term, count in term_counts}
def is_related_to_product(self, term, wv):
    """Decide whether *term* is an aspect candidate related to this product.

    A term qualifies when the word2vec model knows it, its similarity to the
    product word exceeds MIN_SIMILARITY, and — if a parent extractor exists —
    it occurs sufficiently more often in this category than in the parent
    (ratio above FREQ_OVER_PARENT), or never occurs in the parent at all.
    """
    # Words absent from the model vocabulary cannot be scored at all.
    if term not in wv.vocab:
        return False
    if wv.similarity(self.product, term) <= TargetExtractor.MIN_SIMILARITY:
        return False
    # With no parent category there is no frequency filter to apply.
    if not self.parent:
        return True
    parent_freq = self.parent.frequency_for_term(term)
    # A term unseen in the parent corpus is trivially over-represented here.
    if parent_freq == 0:
        return True
    return self.frequency_for_term(term) / parent_freq > TargetExtractor.FREQ_OVER_PARENT
@staticmethod
def get_syn_pairs(terms, model):
    """Return the set of unordered synonym pairs among *terms*.

    Two distinct terms form a pair when their word2vec similarity exceeds
    SYNONYM_SIMILARITY; each pair is stored as a frozenset so that
    (t1, t2) and (t2, t1) collapse into one entry.

    Improvement: iterate itertools.combinations rather than the full
    t1 x t2 cross product, so each unordered pair is scored once instead
    of twice (word2vec cosine similarity is symmetric), halving the
    number of model.similarity calls. The result set is unchanged.
    """
    return {frozenset((t1, t2)) for t1, t2 in itertools.combinations(terms, 2)
            if t1 != t2 and model.similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
def frequency_for_term(self, term):
    """Return the relative frequency of *term* among all counted terms.

    Divides the term's raw occurrence count by the total count over the
    whole counter, yielding a value in [0, 1].
    """
    occurrences = self.counter[term]
    return occurrences / self.total_count
def get_word2vec_model(self):
    """Train a word2vec model on the tokenized phrases and return its word vectors.

    min_count=5 discards words occurring fewer than 5 times, so rare noise
    never enters the vocabulary. Only the KeyedVectors (.wv) are returned;
    the full trainable model is discarded.
    """
    trained = Word2Vec(self.phrases, min_count=5)
    return trained.wv
@staticmethod
def wordnet_relatedness(t1, t2):
fst = wordnet.synset(t1 + '.n.01')
......@@ -199,18 +220,9 @@ class TargetExtractor:
class Synset:
def __init__(self, aspects):
def __init__(self, aspects, syn_pairs):
self.vocab = aspects
self.syn_pairs = {frozenset((aspect, syn)) for aspect in aspects
for syn in self.get_syns(aspect) if aspect != syn}
def get_syns(self, word):
syns = set()
for syn in wordnet.synsets(word, pos=wordnet.NOUN):
for lemma in syn.lemmas():
syns.add(lemma.name())
syns = {syn for syn in syns if syn in self.vocab and cnet.get_relatedness(syn, word) > 0.5}
return syns
self.syn_pairs = syn_pairs
def get_dict(self, counts):
groups = self.get_groups()
......@@ -231,6 +243,8 @@ class Synset:
def join_groups(w1, w2, groups):
g1 = Synset.group_for(w1, groups)
g2 = Synset.group_for(w2, groups)
if g1 and g2 and g1 == g2:
return True
if g1:
groups.remove(g1)
if g2:
......@@ -260,13 +274,11 @@ class Synset:
return None
electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'reviewText')
print(1)
electronics_extractor = TargetExtractor('device', electronics_texts)
print(2)
camera_texts = obtain_texts('data/camera_metadata.tsv', 'feature')
print(3)
camera_extractor = TargetExtractor('camera', camera_texts, parent=electronics_extractor)
tree, synonyms = camera_extractor.get_tree_and_synonyms()
laptop_texts = obtain_texts('data/laptop_reviews.tsv', 'review_body')
laptop_extractor = TargetExtractor('laptop', laptop_texts)
camera_texts = obtain_texts('data/camera_prepared_data.tsv', 'review_body')
camera_extractor = TargetExtractor('camera', camera_texts, parent=laptop_extractor)
tree, syns = camera_extractor.get_tree_and_synonyms()
print(RenderTree(tree))
print(synonyms)
print(syns)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment