Commit ef388c59 authored by Joel Oksanen's avatar Joel Oksanen
Browse files

A lot of changes to the target extractor. It now performs decently on cameras and laptops.

parent 8c3b320d
......@@ -4,3 +4,4 @@ __pycache__/
server/agent/amazon_data/
server/agent/target_extraction/data/
.DS_Store
*.pickle
\ No newline at end of file
......@@ -22,25 +22,44 @@ def get_df(path):
return pd.DataFrame.from_dict(df, orient='index')
child_product = 'speaker'
reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
compression='gzip')
parent_output = 'target_extraction/data/electronics_reviews.tsv'
child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
pd.set_option('display.max_colwidth', None)
for col in reviews.columns:
category = 'Laptops'
metadata = pd.read_json('amazon_data/meta_Electronics.json', lines=True)# get_df('amazon_data/meta_Electronics.json.gz')
for col in metadata.columns:
print(col)
c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
c_reviews = c_reviews.head(MAX_ITEMS)
p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
p_reviews = p_reviews.head(MAX_ITEMS)
metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]
p_reviews.to_csv(parent_output, sep='\t', index=False)
c_reviews.to_csv(child_output, sep='\t', index=False)
print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
print(metadata['category'][:5])
print(len(metadata.index))
review_iter = pd.read_json('amazon_data/Electronics.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_laptop_reviews.tsv', sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
# compression='gzip')
# parent_output = 'target_extraction/data/electronics_reviews.tsv'
# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
#
# for col in reviews.columns:
# print(col)
#
# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
# c_reviews = c_reviews.head(MAX_ITEMS)
# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
# p_reviews = p_reviews.head(MAX_ITEMS)
#
# p_reviews.to_csv(parent_output, sep='\t', index=False)
# c_reviews.to_csv(child_output, sep='\t', index=False)
# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
......
......@@ -2,110 +2,169 @@ import pandas as pd
import ast
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
from concept_net import ConceptNet
from anytree import Node, RenderTree
import itertools
import numpy as np
import re
from gensim.models import Word2Vec
from gensim.models import Word2Vec, KeyedVectors
import pickle
import math
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
cnet = ConceptNet()
def _read_tsv(path):
    """Load a tab-separated file, reporting and skipping malformed rows."""
    try:
        # Spelling used by pandas >= 1.3. 'warn' keeps the behaviour of the old
        # error_bad_lines=False default: bad rows are reported, then skipped.
        return pd.read_csv(path, sep='\t', on_bad_lines='warn')
    except TypeError:
        # Older pandas releases only accept the legacy keyword.
        return pd.read_csv(path, sep='\t', error_bad_lines=False)


def obtain_texts(path, col):
    """Return the non-null values of column ``col`` from the TSV file at ``path``.

    Values are returned in file order.
    """
    file = _read_tsv(path)
    return [text for text in file[col] if not pd.isnull(text)]


def obtain_review_texts(path, title_col, review_col):
    """Return ``(title, review)`` tuples from the TSV file at ``path``.

    Rows where either ``title_col`` or ``review_col`` is null are dropped;
    the remaining rows keep their file order.
    """
    file = _read_tsv(path)
    complete = file.dropna(subset=[title_col, review_col])
    # Zipping the two columns avoids building a Series per row with iterrows().
    return list(zip(complete[title_col], complete[review_col]))
class TargetExtractor:
PHRASE_THRESHOLD = 4
MIN_RELATEDNESS = 0.3
N_ASPECTS = 30
N_ASPECTS = 50
MIN_DIRECT_GAIN = 0.1
DEPTH_COST = 0.3
FREQ_OVER_PARENT = 3 # target must appear x times more frequently than in parent
FREQ_OVER_PARENT = 10 # target must appear x times more frequently than in parent
OUTLIER_COEFFICIENT = 5
N_DIRECT_FEATURES = 3 # top N_DIRECT_FEATURES features will be direct children of the product (not subfeatures)
PARENT_COUNT_FRAC = 0.5 # feature f1 will only be considered as a subfeature of f2 if c(f1) / c(f2) > this value
# word2vec
MIN_TERM_COUNT = 100
MIN_SIMILARITY = 0
SYNONYM_SIMILARITY = 0.7
SYNONYM_SIMILARITY = 0.1
# parent is a TargetExtractor of a parent category, e.g. electronics > camera
def __init__(self, product, texts, parent=None):
self.product = product
self.parent = parent
print('tokenizing phrases...')
# tokenize and normalize phrases
self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in texts]
for text in texts for phrase in sent_tokenize(text)]
print('obtaining bigrams...')
# train bigram map
tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases)
bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
self.phraser = Phraser(trigram)
print('counting terms...')
# count terms
self.counter = self.count_nouns()
self.total_count = sum(self.counter.values())
def get_tree_and_synonyms(self):
print('training word2vec model...')
# train word2vec model
wv = self.get_word2vec_model()
self.wv = self.get_word2vec_model()
print('mining aspects...')
# mine aspects
aspects, counts = self.get_related_nouns(self.counter, wv)
print(aspects)
aspects, counts = self.get_related_nouns(self.counter, self.wv)
print('extracting synonyms...')
# obtain synonyms
syn_pairs = self.get_syn_pairs(aspects, wv)
print(syn_pairs)
syn_pairs = self.get_syn_pairs(aspects, self.wv)
synset = Synset(aspects, syn_pairs)
syn_dict = synset.get_dict(counts)
self.syn_dict = synset.get_dict(counts)
# remove aspect synonyms
aspects = [aspect for aspect in aspects if aspect in syn_dict.keys()]
counts = {aspect: sum(counts[syn] for syn in syn_dict[aspect])
for aspect, count in counts.items() if aspect in aspects}
# remove aspect synonyms and reorder list based on sum of all synonym counts
aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
self.counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect]) for aspect in aspects}
self.aspects = sorted(aspects, key=self.counts.get, reverse=True)
print(self.aspects)
print(self.syn_dict)
print('extracting relatedness matrix...')
# extract relationships between aspects
relatedness_matrix = self.get_relations(aspects, counts, syn_dict)
self.relatedness_matrix = self.get_scaled_relations()
print('extracting aspect tree...')
# extract aspect tree
tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
self.tree = self.get_product_tree2()
return self.tree, self.syn_dict
def get_scaled_relations(self):
relatedness_matrix = np.ma.zeros((len(self.aspects), len(self.aspects)))
relatedness_matrix.mask = False
for tokenized_phrase in self.phrases:
phrase = self.phraser[tokenized_phrase]
matches = {a_idx: phrase_idxs for a_idx, phrase_idxs in
{a_idx: [phrase_idx for syn in self.syn_dict[aspect] for
phrase_idx, term in enumerate(phrase) if syn == term]
for a_idx, aspect in enumerate(self.aspects)}.items()
if len(phrase_idxs) > 0}
if len(matches) != 2:
continue
(idx1, p_idxs1), (idx2, p_idxs2) = matches.items()
combiner_indices = {(min(idx1, idx2) + 1, max(idx1, idx2) - 1) for idx1 in p_idxs1 for idx2 in p_idxs2 if abs(idx1 - idx2) < 10}
if any(any(combiner in ' '.join(phrase[start_idx:(end_idx+1)]) for combiner in ['and', 'as well as', 'in addition to']) for start_idx, end_idx in combiner_indices):
continue
relatedness_matrix[idx1][idx2] += 1
relatedness_matrix[idx2][idx1] += 1
for idx1, t1 in enumerate(self.aspects):
for idx2, t2 in enumerate(self.aspects):
relatedness_matrix[idx1][idx2] = relatedness_matrix[idx1][idx2] / (self.counts[t1] * math.sqrt(self.counts[t2])) # math.sqrt(pow(counts[t1], 2) + pow(counts[t2], 2))
# mask value if it will not be considered as a parent
if idx1 == idx2 or self.counts[t2] / self.counts[t1] < TargetExtractor.PARENT_COUNT_FRAC:
relatedness_matrix[idx1][idx2] = np.ma.masked
relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
return tree, syn_dict
return relatedness_matrix
def get_relations(self, targets, counts, syn_dict):
pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
relatedness_matrix = np.zeros((len(targets), len(targets)))
for phrase in self.phrases:
bigrams = self.bigrammer[phrase]
for pair in pair_counts:
t1, t2 = pair
if (any(term in bigrams for term in syn_dict[t1]) and
any(term in bigrams for term in syn_dict[t2])):
pair_counts[pair] += 1
for tokenized_phrase in self.phrases:
phrase = self.phraser[tokenized_phrase]
matches = {t_idx for t_idx, target in enumerate(targets) if any(t in phrase for t in syn_dict[target])}
for idx1, idx2 in {frozenset((idx1, idx2)) for idx1 in matches for idx2 in matches if idx1 != idx2}:
relatedness_matrix[idx1][idx2] += 1
relatedness_matrix[idx2][idx1] += 1
counts_arr = np.zeros(len(targets))
for idx, target in enumerate(targets):
counts_arr[idx] = counts[target]
relatedness_matrix = (relatedness_matrix.T / counts_arr).T
return relatedness_matrix
def get_exclusive_relations(self, targets, counts, syn_dict):
relatedness_matrix = np.zeros((len(targets), len(targets)))
for row in range(0, len(targets) - 1):
for col in range(row + 1, len(targets)):
t1 = targets[row]
t2 = targets[col]
score = pair_counts[(t1, t2)] / (counts[t1] * counts[t2])
relatedness_matrix[row][col] = score
for tokenized_phrase in self.phrases:
phrase = self.phraser[tokenized_phrase]
matches = {t_idx for t_idx, target in enumerate(targets) if any(t in phrase for t in syn_dict[target])}
if len(matches) == 2:
idx1, idx2 = matches
relatedness_matrix[idx1][idx2] += 1
relatedness_matrix[idx2][idx1] += 1
for col in range(0, len(targets) - 1):
for row in range(col + 1, len(targets)):
relatedness_matrix[row][col] = relatedness_matrix[col][row]
counts_arr = np.zeros(len(targets))
for idx, target in enumerate(targets):
counts_arr[idx] = counts[target]
relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
relatedness_matrix = (relatedness_matrix.T / counts_arr).T
return relatedness_matrix
def count_nouns(self):
......@@ -113,16 +172,21 @@ class TargetExtractor:
for phrase in self.phrases:
pos_tags = pos_tag(phrase)
bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
for bigram in self.bigrammer[phrase]]
for bigram in self.phraser[phrase]]
word_idx = 0
for token in bigrams:
if '_' in token:
words = token.split('_')
if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
word_range = range(word_idx, word_idx + len(words))
has_noun = any(TargetExtractor.is_noun(pos_tags[i]) for i in word_range)
all_terms_valid = all(TargetExtractor.is_valid_term(pos_tags[i]) for i in word_range)
if has_noun and all_terms_valid:
nouns.append(TargetExtractor.singular(token))
word_idx += len(words)
else:
if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
is_noun = TargetExtractor.is_noun(pos_tags[word_idx])
is_valid = TargetExtractor.is_valid_term(pos_tags[word_idx])
if len(token) > 1 and is_noun and is_valid:
nouns.append(TargetExtractor.singular(token))
word_idx += 1
......@@ -134,11 +198,12 @@ class TargetExtractor:
term_counts = []
while len(term_counts) < TargetExtractor.N_ASPECTS:
term, count = common.pop(0)
print(term, count)
# filter terms not related to the product
# cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
if self.is_related_to_product(term, wv):
term_counts.append((term, count))
print('accepted:', term)
else:
print('rejected:', term)
terms = [term for term, count in term_counts]
# bring product to front of list
......@@ -155,17 +220,31 @@ class TargetExtractor:
(not self.parent or self.parent.frequency_for_term(term) == 0 or
self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
TargetExtractor.FREQ_OVER_PARENT))
# cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
@staticmethod
def get_syn_pairs(terms, model):
return {frozenset((t1, t2)) for t1 in terms for t2 in terms
if t1 != t2 and model.similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
if t1 != t2 and model.relative_cosine_similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
def frequency_for_term(self, term):
    """Return the fraction of all counted term occurrences that are ``term``.

    ``self.counter`` maps each term to its count and ``self.total_count`` is
    the sum of those counts. Returns 0.0 when nothing was counted, so callers
    comparing frequencies never hit a ZeroDivisionError.
    """
    if not self.total_count:
        return 0.0
    return self.counter[term] / self.total_count
def get_word2vec_model(self):
return Word2Vec(self.phrases, min_count=5).wv
model = Word2Vec(self.phraser[self.phrases], min_count=TargetExtractor.MIN_TERM_COUNT).wv
return model
def save(self):
    """Pickle this extractor to ``<product>_extractor.pickle``.

    The file name is relative, so it is written to the current working
    directory.
    """
    # The context manager closes the file even if pickling fails part-way.
    with open(self.product + '_extractor.pickle', 'wb') as f:
        pickle.dump(self, f)
@staticmethod
def load_saved(product):
    """Load the object previously pickled to ``<product>_extractor.pickle``.

    The file name is relative, so it is read from the current working
    directory.
    """
    # NOTE: unpickling can execute arbitrary code; only load pickle files that
    # this project wrote itself.
    # The context manager closes the file even if unpickling fails.
    with open(product + '_extractor.pickle', 'rb') as f:
        return pickle.load(f)
@staticmethod
def wordnet_relatedness(t1, t2):
......@@ -192,6 +271,70 @@ class TargetExtractor:
return root
# product has to be at idx 0
# targets have to be sorted in descending order based on counts (excluding product)
@staticmethod
def get_product_tree(dep_matrix, targets):
remaining_targets = [idx for idx in range(len(targets))]
root = Node(targets[remaining_targets.pop(0)])
n_null = 3
dependencies = [None] * n_null + [TargetExtractor.get_significant_dependence(
idx, dep_matrix, ignore_idx=[0]+list(range(idx+1, len(targets)))) for idx in range(n_null, len(targets))]
print(dependencies)
while remaining_targets:
idx = remaining_targets.pop(0)
t_node = Node(targets[idx], parent=root)
t_node.idx = idx
dependants = [(d_idx, dep[1]) for d_idx, dep in enumerate(dependencies) if dep and dep[0] == idx]
print(t_node, [targets[i] for i, _ in dependants])
for d_idx, _ in sorted(dependants, key=lambda p: p[1], reverse=True):
if d_idx not in remaining_targets:
continue
# parent = root
# if not t_node.children or any(TargetExtractor.are_correlated(d_idx, c.idx, dep_matrix, ignore_idx=[0, idx]) for c in t_node.children):
# parent = t_node
d_node = Node(targets[d_idx], parent=t_node)
d_node.idx = d_idx
remaining_targets.remove(d_idx)
# for idx, t in enumerate(targets):
# if idx == 0:
# continue
# dep_idx =
# parent = next((d for d in root.descendants if d.idx == dep_idx), root)
# node = Node(t, parent=parent)
# node.idx = idx
return root
def get_product_tree2(self):
    """Build the aspect tree rooted at the product (``self.aspects[0]``).

    The first ``N_DIRECT_FEATURES`` aspects after the product always become
    direct children of the root. Every remaining aspect is either attached
    beneath the aspect that ``get_dependant`` reports it depends on, or, if
    no aspect claims it before its own turn, attached to the root.

    Each created node carries an ``idx`` attribute holding its position in
    ``self.aspects``.

    Returns:
        The root node of the tree.
    """
    root = Node(self.aspects[0])
    nodes = {}  # aspect index -> node, for every aspect placed so far

    # Clamp so that a short aspect list does not index past its end.
    n_direct = min(TargetExtractor.N_DIRECT_FEATURES, len(self.aspects) - 1)
    for idx in range(1, n_direct + 1):
        node = Node(self.aspects[idx], parent=root)
        node.idx = idx
        nodes[idx] = node

    unassigned = set(range(n_direct + 1, len(self.aspects)))

    # visit features in list order (self.aspects is expected highest count first)
    for idx in range(1, len(self.aspects)):
        print(self.aspects[idx])
        # create node for aspect with parent root if it is unassigned
        node = nodes.get(idx)
        if node is None:
            node = Node(self.aspects[idx], parent=root)
            node.idx = idx
            nodes[idx] = node
            unassigned.remove(idx)

        # repeatedly pull the strongest remaining dependant under this aspect;
        # already attached children are passed along so they are ignored when
        # the next dependant is scored
        dep_idx = self.get_dependant(idx, [], unassigned)
        while dep_idx is not None:
            print(' ', self.aspects[dep_idx])
            dep_node = Node(self.aspects[dep_idx], parent=node)
            dep_node.idx = dep_idx
            nodes[dep_idx] = dep_node
            unassigned.remove(dep_idx)
            dep_idx = self.get_dependant(idx, [child.idx for child in node.children], unassigned)

    return root
@staticmethod
def kruskal(vertices, edges):
result = set()
......@@ -217,6 +360,81 @@ class TargetExtractor:
word, tag = pos_tagged
return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words
# true if term is not a preposition and does not include special characters
@staticmethod
def is_valid_term(pos_tagged):
    """Return True if a POS-tagged word may be part of a term.

    Args:
        pos_tagged: a ``(word, tag)`` pair.

    Returns:
        False when the tag is ``'IN'`` or when the word contains anything
        other than word characters (letters, digits, underscore); True
        otherwise.
    """
    word, tag = pos_tagged
    if tag == 'IN':
        return False
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    return re.match(r'^\w+$', word) is not None
@staticmethod
def print_relations(target_indices, dep_matrix, targets):
idx_pairs = {frozenset((idx1, idx2)) for idx1 in target_indices for idx2 in target_indices if idx1 != idx2}
for idx1, idx2 in idx_pairs:
t1 = targets[idx1]
t2 = targets[idx2]
print('{} {:.4f} {}'.format(t1, dep_matrix[idx1][idx2], t2))
print('{} {:.4f} {}'.format(' ' * len(t1), dep_matrix[idx2][idx1], ' ' * len(t2)))
print('')
def print_relations_from(self, aspect):
    """Print the relatedness of ``aspect`` to every aspect, strongest first.

    One line is printed per aspect: the score to four decimals followed by
    the aspect name. Masked matrix entries are unmasked on a copy first, so
    their underlying values are shown as well.
    """
    row = self.relatedness_matrix[self.aspects.index(aspect)].copy()
    row.mask = False
    ranked = sorted(range(len(self.aspects)), key=row.__getitem__, reverse=True)
    for other in ranked:
        print('{:.4f}'.format(row[other]), self.aspects[other])
def get_dependant(self, idx, child_indices, unassigned_indices):
    """Return the unassigned aspect that depends most strongly on ``idx``.

    Each candidate in ``unassigned_indices`` is asked for its significant
    dependence, with index 0 and ``child_indices`` ignored. Among the
    candidates whose dependence points at ``idx``, the one with the highest
    score is returned.

    Returns:
        The index of the winning candidate, or None when no candidate
        depends on ``idx`` with a positive score.
    """
    excluded = [0] + child_indices
    best_idx, best_score = None, 0
    for candidate in unassigned_indices:
        dependence = self.get_significant_dependence(candidate, ignore_idx=excluded)
        if dependence is None:
            continue
        parent_idx, score = dependence
        # strict comparison: on a tie the first candidate seen is kept
        if parent_idx == idx and score > best_score:
            best_idx, best_score = candidate, score
    return best_idx
def get_significant_dependence(self, idx, ignore_idx=None):
    """Return the one aspect that aspect ``idx`` clearly depends on, if any.

    Works on a copy of row ``idx`` of ``self.relatedness_matrix`` with the
    entries in ``ignore_idx`` masked (``[0]`` when none are given). The
    result is the row's high outlier, but only when it is the sole one:
    if a second high outlier remains after masking the first, the
    dependence is treated as ambiguous.

    Returns:
        The ``(index, score)`` pair from ``high_outlier_idx``, or None.
    """
    skipped = ignore_idx if ignore_idx else [0]
    deps = self.relatedness_matrix[idx].copy()
    for masked_idx in skipped:
        deps[masked_idx] = np.ma.masked

    strongest = TargetExtractor.high_outlier_idx(deps)
    if strongest is None:
        return None

    # hide the strongest entry and check that no other outlier remains
    deps[strongest[0]] = np.ma.masked
    runner_up = TargetExtractor.high_outlier_idx(deps)
    return strongest if runner_up is None else None
@staticmethod
def high_outlier_idx(arr):
    """Return the largest entry of ``arr`` if it is a high outlier.

    Masked entries are ignored. The limit is
    ``q3 + OUTLIER_COEFFICIENT * (q3 - q1)``, where q1 and q3 are the first
    and third quartiles of the unmasked values.

    Returns:
        ``(index, value / limit)`` for the maximum unmasked value when it
        exceeds the limit; None otherwise, including when ``arr`` has no
        unmasked values.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    values = np.ma.filled(arr, np.nan)
    if np.all(np.isnan(values)):
        # nothing left to compare; nanargmax would raise on an all-NaN input
        return None
    q1, q3 = np.nanquantile(values, [0.25, 0.75])
    max_idx = np.nanargmax(values)
    lim = q3 + TargetExtractor.OUTLIER_COEFFICIENT * (q3 - q1)
    return (max_idx, arr[max_idx] / lim) if arr[max_idx] > lim else None
@staticmethod
def are_correlated(idx1, idx2, dep_matrix, ignore_idx=None):
    """Return whether aspects ``idx1`` and ``idx2`` rank highly for each other.

    Rows ``idx1`` and ``idx2`` of ``dep_matrix`` are masked at the positions
    listed in ``ignore_idx``. The result is truthy when ``in_q3`` holds in
    both directions: ``idx2`` within row ``idx1`` and ``idx1`` within row
    ``idx2``.
    """
    ignored = ignore_idx or []
    hidden = [col in ignored for col in range(len(dep_matrix[idx1]))]
    row1 = np.ma.masked_array(dep_matrix[idx1], mask=hidden)
    row2 = np.ma.masked_array(dep_matrix[idx2], mask=hidden)
    return TargetExtractor.in_q3(row1, idx2) and TargetExtractor.in_q3(row2, idx1)
@staticmethod
def in_q3(arr, idx):
    """Return whether ``arr[idx]`` is at or above the third quartile of ``arr``.

    ``arr`` may be a masked array; masked entries do not contribute to the
    quartile.
    """
    # np.quantile is not mask-aware, so drop masked entries before calling it.
    return arr[idx] >= np.quantile(np.ma.compressed(arr), 0.75)
class Synset:
......@@ -274,11 +492,29 @@ class Synset:
return None
laptop_texts = obtain_texts('data/laptop_reviews.tsv', 'review_body')
laptop_extractor = TargetExtractor('laptop', laptop_texts)
camera_texts = obtain_texts('data/camera_prepared_data.tsv', 'review_body')
camera_extractor = TargetExtractor('camera', camera_texts, parent=laptop_extractor)
tree, syns = camera_extractor.get_tree_and_synonyms()
electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
electronics_extractor = TargetExtractor('device', electronics_texts)
texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
extractor = TargetExtractor('laptop', texts, parent=electronics_extractor)
tree, syns = extractor.get_tree_and_synonyms()
print(RenderTree(tree))
print(syns)
extractor.save()
# np.set_printoptions(precision=4, suppress=True, threshold=np.inf)
# extractor: TargetExtractor = TargetExtractor.load_saved()
# extractor.relatedness_matrix = extractor.get_scaled_relations()
# tree, _ = extractor.get_tree_and_synonyms()
# print(RenderTree(tree))
# print(extractor.aspects)
# print(extractor.relatedness_matrix)
# extractor.save()
# print(extractor.aspects)
# print(extractor.relatedness_matrix)
# extractor.save()
# for a in ['lcd_screen', 'viewfinder', 'lens', 'image_stabilization']:
# print(a)
# extractor.print_relations_from(a)
# print(extractor.counts['lcd_screen'], extractor.counts['viewfinder'])
# print(RenderTree(extractor.get_product_tree2()))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment