Commit 7d9fd221 authored by Joel Oksanen's avatar Joel Oksanen
Browse files

Added parent target extractors

parent ab2bf2ed
import pandas as pd import pandas as pd
import gzip import gzip
import json import json
import re
MAX_ITEMS = 150000
def parse(path): def parse(path):
...@@ -16,17 +17,22 @@ def get_df(path): ...@@ -16,17 +17,22 @@ def get_df(path):
for d in parse(path): for d in parse(path):
df[i] = d df[i] = d
i += 1 i += 1
if i == 1000000:
break
return pd.DataFrame.from_dict(df, orient='index') return pd.DataFrame.from_dict(df, orient='index')
# Load the raw Amazon electronics review dump and keep a random sample of
# at most MAX_ITEMS rows for downstream target extraction.
metadata = get_df('amazon_data/Electronics.json.gz')
output_location = 'target_extraction/data/electronics_reviews.tsv'

# Show the available columns for quick inspection.
for col in metadata.columns:
    print(col)

# Shuffle the rows, then truncate to the configured sample size.
metadata = metadata.sample(frac=1).reset_index(drop=True)
metadata = metadata.head(MAX_ITEMS)

# # get metadata for guitar products
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]

# # get metadata for camera products
# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
...@@ -50,4 +56,4 @@ metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, reg ...@@ -50,4 +56,4 @@ metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, reg
# print('tech2:', row['tech2']) # print('tech2:', row['tech2'])
# Persist the prepared sample as tab-separated values for the extractors.
metadata.to_csv(output_location, sep='\t', index=False)
print('Successfully prepared data for', len(metadata.index), 'reviews')
...@@ -18,61 +18,69 @@ wnl = WordNetLemmatizer() ...@@ -18,61 +18,69 @@ wnl = WordNetLemmatizer()
cnet = ConceptNet() cnet = ConceptNet()
def obtain_texts(path, col):
    """Read column *col* from the TSV file at *path*, dropping missing values.

    Returns the non-null cell values of the column as a list, in order of
    appearance in the file. Malformed rows are skipped with a warning,
    matching the old ``error_bad_lines=False`` behaviour.
    """
    # error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
    # on_bad_lines='warn' is the equivalent "warn and skip bad rows" option.
    frame = pd.read_csv(path, sep='\t', on_bad_lines='warn')
    return [text for _, text in frame[col].items() if not pd.isnull(text)]
#for text in ast.literal_eval(texts)]
class TargetExtractor:
    """Mines aspect targets for a product category from a collection of texts."""

    # Minimum ConceptNet relatedness for a term to count as product-related.
    MIN_RELATEDNESS = 0.3
    # Number of aspect terms to mine.
    N_ASPECTS = 30
    MIN_DIRECT_GAIN = 0.1
    DEPTH_COST = 0.3
    # A target must appear this many times more frequently than in the
    # parent category to be considered specific to this category.
    FREQ_OVER_PARENT = 2
def __init__(self, product, metadata_path): # parent is a TargetExtrator of a parent category, eg. > electronics > camera
def __init__(self, product, texts, parent=None):
self.product = product self.product = product
self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False) self.parent = parent
features = self.get_all('feature')
# descriptions = self.get_all('description')
# tech1 = self.get_all('tech1')
# tech2 = self.get_all('tech2')
# tokenize and normalize phrases # tokenize and normalize phrases
self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))] self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in features] for phrase in texts]
# train bigram map # train bigram map
tokenized_phrases = Phrases(self.phrases) tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases) self.bigrammer = Phraser(tokenized_phrases)
# count terms
self.counter = self.count_nouns()
self.total_count = sum(self.counter.values())
print(parent, self.total_count)
def get_tree_and_synonyms(self):
# mine aspects # mine aspects
aspects, counts = self.get_related_nouns(TargetExtractor.N_ASPECTS) aspects, counts = self.get_related_nouns(self.counter)
print(aspects)
# obtain synonyms # obtain synonyms
synset = Synset(aspects) synset = Synset(aspects)
self.syn_dict = synset.get_dict(counts) syn_dict = synset.get_dict(counts)
# remove aspect synonyms # remove aspect synonyms
aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()] aspects = [aspect for aspect in aspects if aspect in syn_dict.keys()]
counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect]) counts = {aspect: sum(counts[syn] for syn in syn_dict[aspect])
for aspect, count in counts.items() if aspect in aspects} for aspect, count in counts.items() if aspect in aspects}
# extract relationships between aspects # extract relationships between aspects
relatedness_matrix = self.get_relations(aspects, counts) relatedness_matrix = self.get_relations(aspects, counts, syn_dict)
# extract aspect tree # extract aspect tree
self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix) tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
def get_tree(self):
return self.tree
def get_synonyms(self): return tree, syn_dict
return self.syn_dict
def get_relations(self, targets, counts): def get_relations(self, targets, counts, syn_dict):
pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)} pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
for phrase in self.phrases: for phrase in self.phrases:
bigrams = self.bigrammer[phrase] bigrams = self.bigrammer[phrase]
for pair in pair_counts: for pair in pair_counts:
t1, t2 = pair t1, t2 = pair
if (any(term in bigrams for term in self.syn_dict[t1]) and if (any(term in bigrams for term in syn_dict[t1]) and
any(term in bigrams for term in self.syn_dict[t2])): any(term in bigrams for term in syn_dict[t2])):
pair_counts[pair] += 1 pair_counts[pair] += 1
relatedness_matrix = np.zeros((len(targets), len(targets))) relatedness_matrix = np.zeros((len(targets), len(targets)))
...@@ -91,7 +99,7 @@ class TargetExtractor: ...@@ -91,7 +99,7 @@ class TargetExtractor:
relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix)) relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
return relatedness_matrix return relatedness_matrix
def get_related_nouns(self, n): def count_nouns(self):
nouns = [] nouns = []
for phrase in self.phrases: for phrase in self.phrases:
pos_tags = pos_tag(phrase) pos_tags = pos_tag(phrase)
...@@ -109,20 +117,34 @@ class TargetExtractor: ...@@ -109,20 +117,34 @@ class TargetExtractor:
nouns.append(token) nouns.append(token)
word_idx += 1 word_idx += 1
c = Counter(nouns) return Counter(nouns)
common = c.most_common(n)
# filter terms not related to product def get_related_nouns(self, counter):
common = [(term, count) for term, count in common if common = counter.most_common()
cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS]
term_counts = []
while len(term_counts) < TargetExtractor.N_ASPECTS:
term, count = common.pop(0)
print(term)
# filter terms not related to the product
# cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
if (not self.parent or self.parent.frequency_for_term(term) == 0 or
self.frequency_for_term(term) / self.parent.frequency_for_term(term) >
TargetExtractor.FREQ_OVER_PARENT):
term_counts.append((term, count))
terms = [term for term, count in term_counts]
# bring product to front of list # bring product to front of list
targets = [target for target, _ in common] if self.product in terms:
if self.product in targets: terms.remove(self.product)
targets.remove(self.product) else:
targets.insert(0, self.product) terms.pop()
terms.insert(0, self.product)
return terms, {term: count for term, count in term_counts}
return targets, {target: count for target, count in common} def frequency_for_term(self, term):
return self.counter[term] / self.total_count
@staticmethod @staticmethod
def wordnet_relatedness(t1, t2): def wordnet_relatedness(t1, t2):
...@@ -174,10 +196,6 @@ class TargetExtractor: ...@@ -174,10 +196,6 @@ class TargetExtractor:
word, tag = pos_tagged word, tag = pos_tagged
return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words
def get_all(self, col):
return [item for _, items in self.metadata[col].items() if not pd.isnull(items)
for item in ast.literal_eval(items)]
class Synset: class Synset:
...@@ -241,4 +259,14 @@ class Synset: ...@@ -241,4 +259,14 @@ class Synset:
return group return group
return None return None
# Build the parent extractor over generic electronics reviews, then a
# camera-specific extractor that uses the parent to discard terms that are
# not specific to cameras, and print the mined aspect tree.
parent_texts = obtain_texts('data/electronics_reviews.tsv', 'reviewText')
print(1)
parent_extractor = TargetExtractor('device', parent_texts)
print(2)
camera_texts = obtain_texts('data/camera_metadata.tsv', 'feature')
print(3)
camera_extractor = TargetExtractor('camera', camera_texts, parent=parent_extractor)
tree, synonyms = camera_extractor.get_tree_and_synonyms()
print(RenderTree(tree))
print(synonyms)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment