Commit ab2bf2ed authored by Joel Oksanen

Integrated synonyms into target_extractor

parent 4f0969e9
@@ -164,7 +164,7 @@ def get_strengths(qbaf):
#############
all_reviews = pd.read_csv('amazon_data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
all_reviews = pd.read_csv('target_extraction/data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
camera_strengths = []
star_rating_averages = []
@@ -3,7 +3,6 @@ import gzip
import json
import re
output_location = 'target_extraction/data/camera_metadata.tsv'
def parse(path):
g = gzip.open(path, 'rb')
@@ -20,31 +19,35 @@ def get_df(path):
return pd.DataFrame.from_dict(df, orient='index')
metadata = get_df('amazon_data/meta_Electronics.json.gz')
metadata = get_df('amazon_data/meta_Musical_Instruments.json.gz')
output_location = 'target_extraction/data/guitar_metadata.tsv'
for col in metadata.columns:
print(col)
# get metadata for camera products
metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
# try to filter out camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
filter_pat = ''
for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
r = re.compile(filter_pat)
metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
for _, row in metadata.head(20).iterrows():
print('features:', row['feature'])
print('description:', row['description'])
print('tech1:', row['tech1'])
print('tech2:', row['tech2'])
# get metadata for guitar products
metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
# # get metadata for camera products
# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
#
# # try to filter out camera accessories
# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
# 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
# filter_pat = ''
# for word in filter_words:
# word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
# filter_pat += word_filter + '|'
# filter_pat = filter_pat[:-1]
# r = re.compile(filter_pat)
# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
#
# for _, row in metadata.head(20).iterrows():
# print('features:', row['feature'])
# print('description:', row['description'])
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
metadata.to_csv(output_location, sep='\t', index=False)
print('Successfully prepared data for', len(metadata.index), 'products')
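For reference, a minimal sketch of the alternation pattern that the accessory-filter loop above builds (first letter made case-insensitive, words joined with `|`); the three-word subset and the sample titles are invented:

```python
import re

# Invented, reduced subset of the filter words used above.
filter_words = ['accessor', 'battery', 'tripod']

# Same construction as the loop above: '[Aa]ccessor|[Bb]attery|[Tt]ripod'.
filter_pat = '|'.join('[' + w[0].upper() + w[0].lower() + ']' + w[1:] for w in filter_words)
r = re.compile(filter_pat)

# Titles matching any filter word are treated as accessories and dropped.
titles = ['Canon EOS 80D', 'Camera Battery Pack', 'Aluminium Travel Tripod']
print([t for t in titles if not r.search(t)])  # ['Canon EOS 80D']
```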
import requests
import threading
from anytree import Node
import sys
import time
class ConceptNet:
@@ -22,6 +22,7 @@ class ConceptNet:
def get_relatedness(self, f1, f2):
uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=f1.replace(' ','_'), f2=f2.replace(' ','_'))
obj = requests.get(self.url + uri).json()
time.sleep(0.5) # only 3600 requests allowed / hour
return obj['value']
def append_result(self, feature, rel, result_set, lock):
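A minimal usage sketch of the throttled relatedness lookup added above, assuming the instance's base URL is the public ConceptNet API; the feature pair is invented:

```python
import time
import requests

# Assumed base URL of the public ConceptNet API.
BASE_URL = 'http://api.conceptnet.io'

def relatedness(f1, f2):
    # Same endpoint as get_relatedness above: spaces become underscores in the concept URIs.
    uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(
        f1=f1.replace(' ', '_'), f2=f2.replace(' ', '_'))
    value = requests.get(BASE_URL + uri).json()['value']
    time.sleep(0.5)  # throttle, as in the commit; the API allows only 3600 requests per hour
    return value

print(relatedness('camera', 'lens'))  # a relatedness score for the two terms
```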
@@ -3,7 +3,7 @@ import ast
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet, wordnet_ic
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
@@ -11,7 +11,7 @@ from concept_net import ConceptNet
from anytree import Node, RenderTree
import itertools
import numpy as np
from sklearn.preprocessing import normalize
import re
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
@@ -19,39 +19,44 @@ cnet = ConceptNet()
class TargetExtractor:
MIN_RELATEDNESS = 0.1
MIN_RELATEDNESS = 0.3
N_ASPECTS = 50
MIN_DIRECT_GAIN = 0.1
DEPTH_COST = 0.3
def __init__(self, product, metadata_path):
self.product = product
self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
self.features = self.get_all('feature')
self.descriptions = self.get_all('description')
self.tech1 = self.get_all('tech1')
self.tech2 = self.get_all('tech2')
features = self.get_all('feature')
# descriptions = self.get_all('description')
# tech1 = self.get_all('tech1')
# tech2 = self.get_all('tech2')
# tokenize and normalize phrases
self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in self.features]
for phrase in features]
# train bigram map
tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases)
# mine aspects
aspects, counts = self.get_related_nouns(30)
print(aspects)
aspects, counts = self.get_related_nouns(TargetExtractor.N_ASPECTS)
# obtain synonyms
synset = Synset(aspects)
self.syn_dict = synset.get_dict(counts)
print(self.syn_dict)
# remove aspect synonyms
aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect])
for aspect, count in counts.items() if aspect in aspects}
# extract relationships between aspects
relatedness_matrix = self.get_relations(aspects, counts)
# extract aspect tree
self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
print(RenderTree(self.tree))
def get_tree(self):
return self.tree
@@ -66,7 +71,8 @@ class TargetExtractor:
bigrams = self.bigrammer[phrase]
for pair in pair_counts:
t1, t2 = pair
if t1 in bigrams and t2 in bigrams:
if (any(term in bigrams for term in self.syn_dict[t1]) and
any(term in bigrams for term in self.syn_dict[t2])):
pair_counts[pair] += 1
relatedness_matrix = np.zeros((len(targets), len(targets)))
@@ -89,7 +95,8 @@ class TargetExtractor:
nouns = []
for phrase in self.phrases:
pos_tags = pos_tag(phrase)
bigrams = self.bigrammer[phrase]
bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
for bigram in self.bigrammer[phrase]]
word_idx = 0
for token in bigrams:
if '_' in token:
@@ -117,13 +124,19 @@ class TargetExtractor:
return targets, {target: count for target, count in common}
@staticmethod
def wordnet_relatedness(t1, t2):
fst = wordnet.synset(t1 + '.n.01')
snd = wordnet.synset(t2 + '.n.01')
return fst.wup_similarity(snd)
@staticmethod
def spanning_tree_from_root(vertices, weights, root_idx=0):
root = Node(vertices[root_idx])
for idx in np.flip(np.argsort(weights[root_idx])):
if idx == root_idx:
continue
gain = weights[root_idx][idx]
gain = max(TargetExtractor.MIN_DIRECT_GAIN, weights[root_idx][idx])
parent = root
for branch_node in root.descendants:
min_scaled_weight = min(weights[n.idx][idx] * pow(TargetExtractor.DEPTH_COST, branch_node.depth)
@@ -228,6 +241,4 @@ class Synset:
return group
return None
extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
extractor.get_tree()
print(TargetExtractor('camera', 'data/camera_metadata.tsv').get_tree())
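A small sketch of how the synonym integration added to `__init__` is assumed to behave: `syn_dict` keeps one representative aspect per synonym group, non-representative aspects drop out of the aspect list, and their counts fold into the representative. The aspects, counts, and dictionary below are invented:

```python
# Invented aspect counts, e.g. as returned by get_related_nouns.
aspects = ['photo', 'picture', 'lens', 'battery']
counts = {'photo': 12, 'picture': 7, 'lens': 9, 'battery': 4}

# Invented stand-in for what Synset(aspects).get_dict(counts) might return:
# one representative per synonym group.
syn_dict = {'photo': ['photo', 'picture'], 'lens': ['lens'], 'battery': ['battery']}

# Same merge as in __init__: keep representatives, sum counts over each group.
aspects = [aspect for aspect in aspects if aspect in syn_dict.keys()]
counts = {aspect: sum(counts[syn] for syn in syn_dict[aspect])
          for aspect, count in counts.items() if aspect in aspects}

print(aspects)  # ['photo', 'lens', 'battery']
print(counts)   # {'photo': 19, 'lens': 9, 'battery': 4}
```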