Commit ab2bf2ed authored by Joel Oksanen

Integrated synonyms to target_extractor

parent 4f0969e9
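
Review note: the core of this commit is folding synonymous aspect terms into one canonical aspect before relations between aspects are mined. A minimal sketch of that folding step, assuming a `syn_dict` shaped like the one `Synset.get_dict` produces in the diff below (the toy data here is invented):

```python
# Toy stand-ins for the diff's syn_dict and counts (invented data):
# each canonical aspect maps to its synonym group.
syn_dict = {'picture': ['picture', 'photo', 'image'], 'lens': ['lens']}
counts = {'picture': 10, 'photo': 7, 'image': 3, 'lens': 5}

# Keep only canonical aspects and pool their synonyms' counts,
# mirroring the new "remove aspect synonyms" lines in __init__.
aspects = [a for a in counts if a in syn_dict]
counts = {a: sum(counts[s] for s in syn_dict[a]) for a in aspects}

print(aspects)  # ['picture', 'lens']
print(counts)   # {'picture': 20, 'lens': 5}
```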
@@ -164,7 +164,7 @@ def get_strengths(qbaf):
 #############
-all_reviews = pd.read_csv('amazon_data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
+all_reviews = pd.read_csv('target_extraction/data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
 camera_strengths = []
 star_rating_averages = []
...
@@ -3,7 +3,6 @@ import gzip
 import json
 import re
-output_location = 'target_extraction/data/camera_metadata.tsv'
 
 def parse(path):
     g = gzip.open(path, 'rb')
@@ -20,31 +19,35 @@ def get_df(path):
     return pd.DataFrame.from_dict(df, orient='index')
 
-metadata = get_df('amazon_data/meta_Electronics.json.gz')
+metadata = get_df('amazon_data/meta_Musical_Instruments.json.gz')
+output_location = 'target_extraction/data/guitar_metadata.tsv'
 
 for col in metadata.columns:
     print(col)
 
-# get metadata for camera products
-metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
-
-# try to filter out camera accessories
-filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
-                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
-filter_pat = ''
-for word in filter_words:
-    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
-    filter_pat += word_filter + '|'
-filter_pat = filter_pat[:-1]
-r = re.compile(filter_pat)
-metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
-metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
-
-for _, row in metadata.head(20).iterrows():
-    print('features:', row['feature'])
-    print('description:', row['description'])
-    print('tech1:', row['tech1'])
-    print('tech2:', row['tech2'])
+# get metadata for guitar products
+metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
+
+# # get metadata for camera products
+# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
+#
+# # try to filter out camera accessories
+# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
+#                 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
+# filter_pat = ''
+# for word in filter_words:
+#     word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
+#     filter_pat += word_filter + '|'
+# filter_pat = filter_pat[:-1]
+# r = re.compile(filter_pat)
+# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
+# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
+#
+# for _, row in metadata.head(20).iterrows():
+#     print('features:', row['feature'])
+#     print('description:', row['description'])
+#     print('tech1:', row['tech1'])
+#     print('tech2:', row['tech2'])
 
 metadata.to_csv(output_location, sep='\t', index=False)
 print('Successfully prepared data for', len(metadata.index), 'products')
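
One observation on the new title filter: the pattern `'[G]uitar'` only matches a capital G, so titles like 'electric guitar strap' slip through. If case-insensitive matching is intended, pandas supports it directly; a sketch (invented data, not part of the commit):

```python
import pandas as pd

titles = pd.Series(['Fender Guitar', 'electric guitar strap', 'Drum Kit'])

# case=False matches 'Guitar' and 'guitar'; na=False drops missing titles
mask = titles.str.contains('guitar', case=False, na=False, regex=False)
print(mask.tolist())  # [True, True, False]
```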
 import requests
 import threading
-from anytree import Node
-import sys
+import time
 
 class ConceptNet:
...
@@ -22,6 +22,7 @@ class ConceptNet:
     def get_relatedness(self, f1, f2):
         uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=f1.replace(' ','_'), f2=f2.replace(' ','_'))
         obj = requests.get(self.url + uri).json()
+        time.sleep(0.5)  # only 3600 requests allowed / hour
         return obj['value']
 
     def append_result(self, feature, rel, result_set, lock):
...
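
On the new `time.sleep(0.5)`: 3600 requests per hour works out to one per second, so a fixed half-second sleep only respects the limit if each request itself takes at least ~0.5 s. A sketch of an interval-based limiter that holds regardless of request latency (hypothetical helper, not in the commit):

```python
import time

class RateLimiter:
    """Enforce a minimum interval between outgoing requests."""

    def __init__(self, min_interval=1.0):  # 3600/hour -> >= 1 s between calls
        self.min_interval = min_interval
        self.last_call = 0.0

    def wait(self):
        # Sleep off whatever is left of the interval since the last call.
        remaining = self.min_interval - (time.time() - self.last_call)
        if remaining > 0:
            time.sleep(remaining)
        self.last_call = time.time()
```

Since `append_result` runs in threads (it already takes a lock), the real version would also need to guard `wait()` with a lock.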
@@ -3,7 +3,7 @@ import ast
 from collections import Counter
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords, wordnet, wordnet_ic
+from nltk.corpus import stopwords, wordnet
 from nltk.stem import WordNetLemmatizer
 import string
 from gensim.models.phrases import Phrases, Phraser
@@ -11,7 +11,7 @@ from concept_net import ConceptNet
 from anytree import Node, RenderTree
 import itertools
 import numpy as np
-from sklearn.preprocessing import normalize
+import re
 
 stop_words = stopwords.words('english')
 wnl = WordNetLemmatizer()
@@ -19,39 +19,44 @@ cnet = ConceptNet()
 class TargetExtractor:
-    MIN_RELATEDNESS = 0.1
+    MIN_RELATEDNESS = 0.3
+    N_ASPECTS = 50
+    MIN_DIRECT_GAIN = 0.1
     DEPTH_COST = 0.3
 
     def __init__(self, product, metadata_path):
         self.product = product
         self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
-        self.features = self.get_all('feature')
-        self.descriptions = self.get_all('description')
-        self.tech1 = self.get_all('tech1')
-        self.tech2 = self.get_all('tech2')
+        features = self.get_all('feature')
+        # descriptions = self.get_all('description')
+        # tech1 = self.get_all('tech1')
+        # tech2 = self.get_all('tech2')
 
         # tokenize and normalize phrases
         self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
-                        for phrase in self.features]
+                        for phrase in features]
 
         # train bigram map
         tokenized_phrases = Phrases(self.phrases)
         self.bigrammer = Phraser(tokenized_phrases)
 
         # mine aspects
-        aspects, counts = self.get_related_nouns(30)
+        aspects, counts = self.get_related_nouns(TargetExtractor.N_ASPECTS)
+        print(aspects)
 
         # obtain synonyms
         synset = Synset(aspects)
         self.syn_dict = synset.get_dict(counts)
+        print(self.syn_dict)
+
+        # remove aspect synonyms
+        aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
+        counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect])
+                  for aspect, count in counts.items() if aspect in aspects}
 
         # extract relationships between aspects
         relatedness_matrix = self.get_relations(aspects, counts)
 
+        # extract aspect tree
         self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
+        print(RenderTree(self.tree))
 
     def get_tree(self):
         return self.tree
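
The constructor trains `bigrammer` with gensim's `Phrases`/`Phraser` so multi-word aspects like 'battery life' become single tokens. A small self-contained example of that mechanism (toy corpus and thresholds are invented; the diff uses gensim's defaults):

```python
from gensim.models.phrases import Phrases, Phraser

# Toy corpus of token lists, standing in for self.phrases.
corpus = [['battery', 'life', 'is', 'great'],
          ['battery', 'life', 'could', 'improve'],
          ['battery', 'life', 'ok'],
          ['battery', 'life'],
          ['nice', 'lens']]

bigrammer = Phraser(Phrases(corpus, min_count=2, threshold=1))
print(bigrammer[['the', 'battery', 'life', 'rocks']])
# e.g. ['the', 'battery_life', 'rocks'] once the pair scores above threshold
```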
@@ -66,7 +71,8 @@ class TargetExtractor:
             bigrams = self.bigrammer[phrase]
             for pair in pair_counts:
                 t1, t2 = pair
-                if t1 in bigrams and t2 in bigrams:
+                if (any(term in bigrams for term in self.syn_dict[t1]) and
+                        any(term in bigrams for term in self.syn_dict[t2])):
                     pair_counts[pair] += 1
 
         relatedness_matrix = np.zeros((len(targets), len(targets)))
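
The changed condition makes co-occurrence counting synonym-aware: a pair is counted when any synonym of each target appears in the phrase, not just the canonical terms. Illustratively (invented data):

```python
# Invented stand-ins for self.syn_dict and one phrase's bigrams.
syn_dict = {'picture': ['picture', 'photo'], 'battery': ['battery']}
bigrams = ['great', 'photo', 'battery']

t1, t2 = 'picture', 'battery'
# Old check: t1 in bigrams and t2 in bigrams -> False ('picture' is absent).
# New check: any synonym may stand in for its canonical aspect.
hit = (any(term in bigrams for term in syn_dict[t1]) and
       any(term in bigrams for term in syn_dict[t2]))
print(hit)  # True, because 'photo' stands in for 'picture'
```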
@@ -89,7 +95,8 @@ class TargetExtractor:
         nouns = []
         for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = self.bigrammer[phrase]
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+                       for bigram in self.bigrammer[phrase]]
             word_idx = 0
             for token in bigrams:
                 if '_' in token:
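
The new comprehension strips the product name out of mined bigrams so that, e.g., 'camera_lens' counts toward the aspect 'lens' rather than a product-prefixed duplicate, while the bare product token survives as the tree root. A quick check of the regex:

```python
import re

product = 'camera'
bigrams = ['camera_lens', 'battery_life', 'camera']

# Strip the product name (and joining underscores) from bigrams,
# but keep the standalone product token itself.
cleaned = [re.sub('_*' + product + '_*', '', b) if b != product else b
           for b in bigrams]
print(cleaned)  # ['lens', 'battery_life', 'camera']
```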
@@ -117,13 +124,19 @@ class TargetExtractor:
         return targets, {target: count for target, count in common}
 
+    @staticmethod
+    def wordnet_relatedness(t1, t2):
+        fst = wordnet.synset(t1 + '.n.01')
+        snd = wordnet.synset(t2 + '.n.01')
+        return fst.wup_similarity(snd)
+
     @staticmethod
     def spanning_tree_from_root(vertices, weights, root_idx=0):
         root = Node(vertices[root_idx])
         for idx in np.flip(np.argsort(weights[root_idx])):
             if idx == root_idx:
                 continue
-            gain = weights[root_idx][idx]
+            gain = max(TargetExtractor.MIN_DIRECT_GAIN, weights[root_idx][idx])
             parent = root
             for branch_node in root.descendants:
                 min_scaled_weight = min(weights[n.idx][idx] * pow(TargetExtractor.DEPTH_COST, branch_node.depth)
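
The added `wordnet_relatedness` helper scores two terms by Wu-Palmer similarity over their first noun senses. A usage sketch (assumes the WordNet corpus is downloaded; note `wordnet.synset` raises `WordNetError` for terms with no noun sense):

```python
from nltk.corpus import wordnet  # requires nltk.download('wordnet')

fst = wordnet.synset('camera.n.01')
snd = wordnet.synset('lens.n.01')
print(fst.wup_similarity(snd))  # a score in (0, 1]; higher = more related
```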
@@ -228,6 +241,4 @@ class Synset:
                 return group
         return None
 
-print(TargetExtractor('camera', 'data/camera_metadata.tsv').get_tree())
+extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
+extractor.get_tree()