Commit a2ecd08d authored by Joel Oksanen's avatar Joel Oksanen

Fixed some synonym bugs in target extractor

parent 34e90902
......@@ -24,20 +24,20 @@ def get_df(path):
pd.set_option('display.max_colwidth', None)
category = 'Acoustic Guitars'
category = 'Cardigans'
metadata_iter = pd.read_json('amazon_data/meta_Musical_Instruments.json', lines=True, chunksize=1000)
metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
for metadata in metadata_iter])
print(len(metadata.index))
review_iter = pd.read_json('amazon_data/Musical_Instruments.json', lines=True, chunksize=1000)
review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
print(len(reviews.index))
reviews.to_csv('target_extraction/data/verified_acoustic_guitar_reviews.tsv', sep='\t', index=False)
reviews.to_csv('target_extraction/data/verified_cardigan_reviews.tsv', sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
......
......@@ -234,8 +234,6 @@ class BertRelExtractor:
count_matrix[snd_idx][fst_idx] += 1
count_matrix[fst_idx][snd_idx] += 1
prob_matrix = (prob_matrix.T / aspect_counts).T # scale rows by aspect counts
return prob_matrix
return prob_matrix, count_matrix
......@@ -39,7 +39,8 @@ class TargetExtractor:
# word2vec
MIN_TERM_COUNT = 100
SYNONYM_SIMILARITY = 0.10
SYNONYM_SIMILARITY = 0.12
SYNONYM_SIMILARITY_PRODUCT = 0.09
WV_SIZE = 100
WV_WINDOW = 7
......@@ -82,18 +83,20 @@ class TargetExtractor:
print('mining aspects...')
# mine aspects
aspects, counts = self.get_aspects(self.counter)
self.aspects, self.counts = self.get_aspects(self.counter)
print('extracting synonyms...')
# obtain synonyms
syn_pairs = self.get_syn_pairs(aspects, self.wv)
synset = Synset(aspects, syn_pairs, self.product)
self.syn_dict = synset.get_dict(counts)
syn_pairs = self.get_syn_pairs()
synset = Synset(self.aspects, syn_pairs, self.product)
self.syn_dict = synset.get_dict(self.counts)
# remove aspect synonyms and reorder list based on sum of all synonym counts
aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
self.counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect]) for aspect in aspects}
self.aspects = sorted(aspects, key=self.counts.get, reverse=True)
self.aspects = [aspect for aspect in self.aspects if aspect in self.syn_dict.keys()]
self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect]) for aspect in self.aspects}
self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
print(self.syn_dict)
self.save()
......@@ -115,15 +118,6 @@ class TargetExtractor:
print(self.relatedness_matrix)
print(RenderTree(self.tree))
def extract_relatedness_matrix(self):
    """Compute and cache the aspect relatedness matrix, logging the results."""
    print('extracting relatedness matrix...')
    # Delegate pairwise relation extraction to the BERT-based helper.
    matrix = self.get_bert_relations()
    self.relatedness_matrix = matrix
    # Log the aspects, their synonym dictionary, and the resulting matrix.
    for logged in (self.aspects, self.syn_dict, self.relatedness_matrix):
        print(logged)
def save_product_representation(self):
f = open('extracted_products/' + self.product + Product.FILE_EXTENSION, 'wb')
p = Product(self.tree, self.syn_dict)
......@@ -174,10 +168,29 @@ class TargetExtractor:
dataset = PairRelDataset.from_df(df, size=TargetExtractor.MAX_BERT_DATASET_SIZE)
bert_extractor = BertRelExtractor.load_saved(rel_extractor_path)
aspect_counts = np.array([self.counts[aspect] for aspect in self.aspects])
relatedness_matrix = bert_extractor.extract_relations(len(self.aspects), self.aspect_index_map(), aspect_counts,
dataset=dataset)
prob_matrix, count_matrix = bert_extractor.extract_relations(len(self.aspects), self.aspect_index_map(),
aspect_counts, dataset=dataset)
self.relatedness_matrix = (prob_matrix.T / aspect_counts).T # scale rows by aspect counts
return self.relatedness_matrix
def extract_synset(self):
    # Scan aspects from most to least frequent and return the first set of
    # word2vec-similar earlier aspects found, as a set of indices into
    # self.aspects; returns None when no aspect yields such a set.
    # NOTE(review): indentation was reconstructed from a flattened diff —
    # confirm the final `return None` sits at loop (not body) level.
    for idx, aspect in enumerate(self.aspects):
        if idx == 0:
            # Index 0 is the top aspect (the product itself) — never grouped.
            continue
        synset = {idx}
        # Local name shadows the aspect_dependence method — holds the likely
        # parent index (or None) so a parent is not mistaken for a synonym.
        aspect_dependence = self.aspect_dependence(idx)
        for syn_idx in self.get_syns(aspect):
            # Only earlier (more frequent) aspects that are not the parent.
            if syn_idx < idx and syn_idx != aspect_dependence:
                synset.add(syn_idx)
        # Debug output of this aspect's strongest relations.
        self.print_relations_from(aspect)
        if len(synset) > 1:
            return synset
    return None
return relatedness_matrix
def get_syns(self, aspect):
    """Return indices of aspects whose word2vec relative cosine similarity
    to *aspect* exceeds the synonym threshold (excluding *aspect* itself)."""
    threshold = TargetExtractor.SYNONYM_SIMILARITY
    syn_indices = set()
    for candidate_idx, candidate in enumerate(self.aspects):
        if candidate == aspect:
            continue
        if self.wv.relative_cosine_similarity(candidate, aspect) > threshold:
            syn_indices.add(candidate_idx)
    return syn_indices
def aspect_index_map(self):
    """Map every synonym of every aspect to that aspect's position in
    self.aspects, so matrix rows/columns can be looked up by any synonym."""
    mapping = {}
    for position, aspect in enumerate(self.aspects):
        for synonym in self.syn_dict[aspect]:
            mapping[synonym] = position
    return mapping
......@@ -291,10 +304,13 @@ class TargetExtractor:
return None
@staticmethod
def get_syn_pairs(terms, model):
    """Return unordered pairs of distinct terms whose word2vec relative
    cosine similarity exceeds the synonym threshold."""
    pairs = set()
    for t1 in terms:
        for t2 in terms:
            if t1 == t2:
                continue
            if model.relative_cosine_similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY:
                pairs.add(frozenset((t1, t2)))
    return pairs
def get_syn_pairs(self):
    """Collect unordered pairs of distinct aspects that count as synonyms:
    either both lemmatize to the same word, or their word2vec relative cosine
    similarity clears the threshold (a looser threshold applies to pairs
    involving the product itself)."""
    pairs = set()
    for t1 in self.aspects:
        for t2 in self.aspects:
            if t1 == t2:
                continue
            # Same lemma (e.g. singular/plural) is an immediate match.
            if wnl.lemmatize(t1) == wnl.lemmatize(t2):
                pairs.add(frozenset((t1, t2)))
                continue
            if t1 == self.product or t2 == self.product:
                threshold = TargetExtractor.SYNONYM_SIMILARITY_PRODUCT
            else:
                threshold = TargetExtractor.SYNONYM_SIMILARITY
            if self.wv.relative_cosine_similarity(t1, t2) > threshold:
                pairs.add(frozenset((t1, t2)))
    return pairs
def get_word2vec_model(self, size, window, min_count):
model = Word2Vec(self.ngrams(self.phrases), size=size, window=window, min_count=min_count).wv
......@@ -312,15 +328,25 @@ class TargetExtractor:
f.close()
return extractor
def closest_relative_for_idx(self, idx):
    """Index of the aspect most strongly related to the aspect at *idx*."""
    relatedness_row = self.relatedness_matrix[idx]
    return np.argmax(relatedness_row)
def aspect_dependence(self, idx):
    # Return the index of the aspect that the aspect at *idx* appears to
    # depend on (its likely parent feature), or None if no relation is
    # strong enough.
    row = self.relatedness_matrix[idx]
    # Two strongest relations, excluding column 0 (the root/product aspect);
    # the +1 restores original column indices after the [1:] slice.
    max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
    # The strongest relative counts as a parent only if it is a more frequent
    # aspect (lower index) and clearly dominates the runner-up by the
    # SUBFEATURE_MULT margin.
    if max_idx1 < idx and row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT:
        return max_idx1
    else:
        return None
def get_product_tree(self):
root = Node(self.aspects[0])
root.idx = 0
for idx in range(1, len(self.aspects)): # for each feature in order from highest to lowest count
row = self.relatedness_matrix[idx]
max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
if max_idx1 < idx and row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT:
parent = next(n for n in root.descendants if n.idx == max_idx1)
dep_idx = self.aspect_dependence(idx)
if dep_idx is not None:
parent = next(n for n in root.descendants if n.idx == dep_idx)
else:
parent = root
node = Node(self.aspects[idx], parent=parent)
......@@ -343,7 +369,8 @@ class TargetExtractor:
def print_relations_from(self, aspect):
    """Print the five strongest relations from *aspect* to other aspects.

    Fix: this span contained both the pre- and post-commit loop headers
    (a diff-merge artifact), which is not valid code; this emits the
    coherent post-commit version (header line + top-5 limit).
    """
    idx = self.aspects.index(aspect)
    # Copy so sorting keys are unaffected by any later matrix mutation.
    rels = self.relatedness_matrix[idx].copy()
    print(' relations from {}:'.format(aspect))
    # Top five related aspects, strongest first.
    for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True)[:5]:
        print(' {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])
......@@ -368,7 +395,6 @@ class Synset:
groups.append({word})
return groups
# {a, b} and {b, c} become {a, b, c}
@staticmethod
def join_groups(w1, w2, groups):
g1 = Synset.group_for(w1, groups)
......@@ -384,18 +410,6 @@ class Synset:
groups.append(g1.union(g2))
return True
# {a, b} and {b, c} are separate groups unless {a, c}
@staticmethod
def join_identical_groups(w1, w2, groups):
for g1 in [group for group in groups if w1 in group]:
for g2 in [group for group in groups if w2 in group]:
if g1 - {w1} == g2 - {w2}:
groups.remove(g1)
groups.remove(g2)
groups.append(g1.union(g2))
return True
return False
@staticmethod
def group_for(w, groups):
for group in groups:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment