Commit 7d9fd221 authored by Joel Oksanen

Added parent target extractors

parent ab2bf2ed
import pandas as pd
import gzip
import json
import re

+MAX_ITEMS = 150000

def parse(path):
@@ -16,17 +17,22 @@ def get_df(path):
    for d in parse(path):
        df[i] = d
        i += 1
        if i == 1000000:
            break
    return pd.DataFrame.from_dict(df, orient='index')
-metadata = get_df('amazon_data/meta_Musical_Instruments.json.gz')
-output_location = 'target_extraction/data/guitar_metadata.tsv'
+metadata = get_df('amazon_data/Electronics.json.gz')
+output_location = 'target_extraction/data/electronics_reviews.tsv'
for col in metadata.columns:
    print(col)
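+# shuffle the rows so that head(MAX_ITEMS) keeps a random sample rather than the first rows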
-# get metadata for sunglasses
-metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
+metadata = metadata.sample(frac=1).reset_index(drop=True)
+metadata = metadata.head(MAX_ITEMS)
+# # get metadata for sunglasses
+# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
+# # get metadata for camera products
+# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
@@ -50,4 +56,4 @@ metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
# print('tech2:', row['tech2'])
metadata.to_csv(output_location, sep='\t', index=False)
-print('Successfully prepared data for', len(metadata.index), 'products')
+print('Successfully prepared data for', len(metadata.index), 'reviews')
@@ -18,61 +18,69 @@ wnl = WordNetLemmatizer()
cnet = ConceptNet()
+def obtain_texts(path, col):
+    file = pd.read_csv(path, sep='\t', error_bad_lines=False)
+    return [text for _, text in file[col].items() if not pd.isnull(text)]
+    # for text in ast.literal_eval(texts)]
class TargetExtractor:
    MIN_RELATEDNESS = 0.3
-    N_ASPECTS = 50
+    N_ASPECTS = 30
    MIN_DIRECT_GAIN = 0.1
    DEPTH_COST = 0.3
+    FREQ_OVER_PARENT = 2  # target must appear x times more frequently than in parent
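+    # e.g. with FREQ_OVER_PARENT = 2, a term at relative frequency 0.004 here is kept
+    # only if its frequency in the parent corpus is below 0.002 (illustrative figures)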
-    def __init__(self, product, metadata_path):
+    # parent is a TargetExtractor of a parent category, e.g. electronics > camera
+    def __init__(self, product, texts, parent=None):
        self.product = product
-        self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
-        features = self.get_all('feature')
-        # descriptions = self.get_all('description')
-        # tech1 = self.get_all('tech1')
-        # tech2 = self.get_all('tech2')
+        self.parent = parent
        # tokenize and normalize phrases
        self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
-                        for phrase in features]
+                        for phrase in texts]
        # train bigram map
        tokenized_phrases = Phrases(self.phrases)
        self.bigrammer = Phraser(tokenized_phrases)
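        # applying the Phraser merges frequent collocations into single tokens,
        # e.g. 'battery', 'life' -> 'battery_life' (illustrative example)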
+        # count terms
+        self.counter = self.count_nouns()
+        self.total_count = sum(self.counter.values())
+        print(parent, self.total_count)
+    def get_tree_and_synonyms(self):
        # mine aspects
-        aspects, counts = self.get_related_nouns(TargetExtractor.N_ASPECTS)
+        aspects, counts = self.get_related_nouns(self.counter)
+        print(aspects)
        # obtain synonyms
        synset = Synset(aspects)
-        self.syn_dict = synset.get_dict(counts)
+        syn_dict = synset.get_dict(counts)
        # remove aspect synonyms
-        aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
-        counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect])
+        aspects = [aspect for aspect in aspects if aspect in syn_dict.keys()]
+        counts = {aspect: sum(counts[syn] for syn in syn_dict[aspect])
                  for aspect, count in counts.items() if aspect in aspects}
        # extract relationships between aspects
-        relatedness_matrix = self.get_relations(aspects, counts)
+        relatedness_matrix = self.get_relations(aspects, counts, syn_dict)
        # extract aspect tree
-        self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
-    def get_tree(self):
-        return self.tree
+        tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
-    def get_synonyms(self):
-        return self.syn_dict
+        return tree, syn_dict
-    def get_relations(self, targets, counts):
+    def get_relations(self, targets, counts, syn_dict):
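+        # count the phrases in which both targets (or any of their synonyms) co-occur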
        pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
        for phrase in self.phrases:
            bigrams = self.bigrammer[phrase]
            for pair in pair_counts:
                t1, t2 = pair
-                if (any(term in bigrams for term in self.syn_dict[t1]) and
-                        any(term in bigrams for term in self.syn_dict[t2])):
+                if (any(term in bigrams for term in syn_dict[t1]) and
+                        any(term in bigrams for term in syn_dict[t2])):
                    pair_counts[pair] += 1
        relatedness_matrix = np.zeros((len(targets), len(targets)))
@@ -91,7 +99,7 @@ class TargetExtractor:
        relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
        return relatedness_matrix
-    def get_related_nouns(self, n):
+    def count_nouns(self):
        nouns = []
        for phrase in self.phrases:
            pos_tags = pos_tag(phrase)
@@ -109,20 +117,34 @@ class TargetExtractor:
                    nouns.append(token)
                word_idx += 1
-        c = Counter(nouns)
-        common = c.most_common(n)
+        return Counter(nouns)
-        # filter terms not related to product
-        common = [(term, count) for term, count in common if
-                  cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS]
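+    # pops the most common nouns until N_ASPECTS of them survive the parent-frequency filter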
+    def get_related_nouns(self, counter):
+        common = counter.most_common()
+        term_counts = []
+        while len(term_counts) < TargetExtractor.N_ASPECTS:
+            term, count = common.pop(0)
+            print(term)
+            # filter terms not related to the product
+            # cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
+            if (not self.parent or self.parent.frequency_for_term(term) == 0 or
+                    self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
+                    TargetExtractor.FREQ_OVER_PARENT):
+                term_counts.append((term, count))
+        terms = [term for term, count in term_counts]
        # bring product to front of list
-        targets = [target for target, _ in common]
-        if self.product in targets:
-            targets.remove(self.product)
-        targets.insert(0, self.product)
+        if self.product in terms:
+            terms.remove(self.product)
+        else:
+            terms.pop()
+        terms.insert(0, self.product)
+        return terms, {term: count for term, count in term_counts}
-        return targets, {target: count for target, count in common}
+    def frequency_for_term(self, term):
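+        # relative frequency: the term's share of all counted noun occurrences in this corpus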
+        return self.counter[term] / self.total_count
    @staticmethod
    def wordnet_relatedness(t1, t2):
@@ -174,10 +196,6 @@ class TargetExtractor:
        word, tag = pos_tagged
        return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words
-    def get_all(self, col):
-        return [item for _, items in self.metadata[col].items() if not pd.isnull(items)
-                for item in ast.literal_eval(items)]
class Synset:
@@ -241,4 +259,14 @@ class Synset:
                return group
        return None
-print(TargetExtractor('camera', 'data/camera_metadata.tsv').get_tree())
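+# build the parent (electronics) extractor first so that camera-specific terms can be
+# filtered against the more general electronics vocabulary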
+electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'reviewText')
+print(1)
+electronics_extractor = TargetExtractor('device', electronics_texts)
+print(2)
+camera_texts = obtain_texts('data/camera_metadata.tsv', 'feature')
+print(3)
+camera_extractor = TargetExtractor('camera', camera_texts, parent=electronics_extractor)
+tree, synonyms = camera_extractor.get_tree_and_synonyms()
+print(RenderTree(tree))
+print(synonyms)