Commit 8a8afb1e authored by Joel Oksanen

Replace product synonym threshold with product absorption

parent 543b0fd1
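In rough terms: instead of marking an aspect as a product synonym whenever its similarity to the product name clears the fixed SYNONYM_SIMILARITY_PRODUCT threshold, aspects are now absorbed into the product when their similarity-to-relatedness ratio is an interquartile-range outlier, with PRODUCT_ABSORPTION_MULT scaling the fence (see product_absorption() in the diff below). A minimal sketch of that fence rule, with made-up ratios:

    import numpy as np

    # hypothetical ratios of word2vec similarity to the product over BERT
    # relatedness to the product, one per non-product aspect; a high ratio
    # suggests a product alias rather than a genuine feature
    vals = np.array([0.20, 0.22, 0.25, 0.28, 0.30, 1.90])

    fst_q = np.quantile(vals, 0.75)          # third quartile
    snd_q = np.quantile(vals, 0.25)          # first quartile
    threshold = snd_q + 3 * (fst_q - snd_q)  # PRODUCT_ABSORPTION_MULT = 3

    print([i for i, v in enumerate(vals) if v > threshold])  # [5]: only the 1.90 outlier is absorbed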
@@ -19,6 +19,7 @@ from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import B
 from pathos.multiprocessing import ProcessingPool as Pool
 import itertools
 from time import time
+from sklearn.metrics.pairwise import cosine_similarity
 np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
 stop_words = stopwords.words('english')
@@ -147,9 +148,9 @@ class TargetExtractor:
     MAX_DEPTH = 2
     # word2vec
-    MIN_TERM_COUNT = 100
+    MIN_TERM_COUNT = 0
     SYNONYM_SIMILARITY = 0.21
-    SYNONYM_SIMILARITY_PRODUCT = 0.08
+    PRODUCT_ABSORPTION_MULT = 3  # see product_absorption()
     WV_SIZE = 300
     WV_WINDOW = 4
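For context on the SYNONYM_SIMILARITY constant that survives: are_syns (further down) sums gensim's relative cosine similarity in both directions and accepts the pair at 0.21 or above. A hedged usage sketch; the repo trains its own word vectors on review text, so the pretrained model and word pair here are stand-ins:

    import gensim.downloader as api

    wv = api.load('glove-wiki-gigaword-50')  # illustrative model choice only

    def similarity(t1, t2):
        # symmetric sum of relative cosine similarity, as in TargetExtractor.similarity()
        return wv.relative_cosine_similarity(t1, t2) + wv.relative_cosine_similarity(t2, t1)

    print(similarity('screen', 'display') >= 0.21)  # SYNONYM_SIMILARITY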
@@ -213,7 +214,7 @@ class TargetExtractor:
         self.relatedness_matrix = self.get_bert_relations()
         print('extracting aspect tree...')
-        self.tree = self.get_product_tree3()
+        self.tree = self.get_product_tree()
         te = time()
         print('Ontology extraction took {} seconds'.format(te - t_syn))
@@ -256,26 +257,31 @@ class TargetExtractor:
         print(' extracting relations with BERT...')
         dataset = PairRelDataset.from_df(df)
         bert_extractor = BertRelExtractor.load_saved(rel_extractor_path)
-        aspect_counts = np.array([self.counts[aspect] for aspect in self.aspects])
         prob_matrix, count_matrix = bert_extractor.extract_relations(len(self.aspects), self.aspect_index_map(),
-                                                                     aspect_counts, dataset=dataset)
+                                                                     self.get_aspect_counts(), dataset=dataset)
+        # absorb non-features to product
+        m = prob_matrix / self.get_aspect_counts()  # scale rows by aspect counts
+        non_features = self.product_absorption(m)
+        for idx in non_features:
+            # absorb probabilities
+            prob_matrix[0] += prob_matrix[idx]
+            prob_matrix[:, 0] += prob_matrix[:, idx]
+            # absorb synonyms and counts
+            self.syn_dict[self.aspects[0]].update(self.syn_dict[self.aspects[idx]])
+            self.counts[self.aspects[0]] += self.counts[self.aspects[idx]]
+            del self.syn_dict[self.aspects[idx]]
+            del self.counts[self.aspects[idx]]
+        prob_matrix = np.delete(np.delete(prob_matrix, non_features, axis=0), non_features, axis=1)
+        self.aspects = [a for idx, a in enumerate(self.aspects) if idx not in non_features]
-        self.relatedness_matrix = prob_matrix / aspect_counts  # scale rows by aspect counts
+        # recalculate relatedness matrix
+        self.relatedness_matrix = prob_matrix / self.get_aspect_counts()
         return self.relatedness_matrix
 
-    def extract_synset(self):
-        for idx, aspect in enumerate(self.aspects):
-            if idx == 0:
-                continue
-            synset = {idx}
-            aspect_dependence = self.aspect_dependence(idx)
-            for syn_idx in self.get_syns(aspect):
-                if syn_idx < idx and syn_idx != aspect_dependence:
-                    synset.add(syn_idx)
-            self.print_relations_from(aspect)
-            if len(synset) > 1:
-                return synset
-        return None
+    def get_aspect_counts(self):
+        return np.array([self.counts[aspect] for aspect in self.aspects])
 
     def get_syns(self, aspect):
         return {idx for idx, a in enumerate(self.aspects)
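The absorption bookkeeping above folds each flagged non-feature's row and column into the product at index 0 before deleting them, so no relation mass is lost. A toy illustration with an arbitrary 3x3 matrix, where aspect 2 plays the flagged non-feature:

    import numpy as np

    prob_matrix = np.array([[0., 2., 1.],
                            [3., 0., 4.],
                            [5., 6., 0.]])
    non_features = [2]  # pretend product_absorption() flagged aspect 2

    for idx in non_features:
        prob_matrix[0] += prob_matrix[idx]        # absorb outgoing relation mass
        prob_matrix[:, 0] += prob_matrix[:, idx]  # absorb incoming relation mass
    prob_matrix = np.delete(np.delete(prob_matrix, non_features, axis=0), non_features, axis=1)

    print(prob_matrix)        # [[6. 8.] [7. 0.]]
    print(prob_matrix.sum())  # 21.0, the same total as before absorption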
@@ -329,44 +335,7 @@ class TargetExtractor:
         f.close()
         return extractor
 
-    def closest_relative_for_idx(self, idx):
-        return np.argmax(self.relatedness_matrix[idx])
-
-    def aspect_dependence(self, idx):
-        row = self.relatedness_matrix[idx]
-        max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
-        if max_idx1 < idx and row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT:
-            return max_idx1
-        else:
-            return None
-
-    def get_product_tree(self):
-        root = Node(self.aspects[0])
-        root.idx = 0
-        for idx in range(1, len(self.aspects)):  # for each feature in order from highest to lowest count
-            dep_idx = self.aspect_dependence(idx)
-            if dep_idx is not None:
-                parent = next(n for n in root.descendants if n.idx == dep_idx)
-            else:
-                parent = root
-            node = Node(self.aspects[idx], parent=parent)
-            node.idx = idx
-        self.node_map = {n.idx: n for n in (root,) + root.descendants}
-        return root
-
-    def aspect_dependence_with_strength(self, idx):
-        row = self.relatedness_matrix[idx]
-        max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
-        if (row[max_idx1] >= row[max_idx2] * TargetExtractor.SUBFEATURE_MULT and
-                self.counts[self.aspects[max_idx1]] * TargetExtractor.COUNT_MULT > self.counts[self.aspects[idx]]):
-            return max_idx1, row[max_idx1]
-        else:
-            return None
-
-    def aspect_dependence_with_strength2(self, idx):
+    def aspect_dependence_with_strength(self, idx):
         row = self.relatedness_matrix[idx]
         max_idx1 = np.argmax(row[1:]) + 1
         if (row[max_idx1] >= row[0] and
@@ -375,43 +344,11 @@ class TargetExtractor:
         else:
             return None
 
-    def get_product_tree2(self):
-        root = Node(self.aspects[0])
-        root.idx = 0
-        deps = {idx: self.aspect_dependence_with_strength2(idx) for idx in range(1, len(self.aspects))}
-        for no_dep_idx in {idx for idx, dep in deps.items() if dep is None}:
-            node = Node(self.aspects[no_dep_idx], parent=root)
-            node.idx = no_dep_idx
-            del deps[no_dep_idx]
-        sorted_deps = sorted(deps.items(), key=lambda x: x[1][1], reverse=True)
-        for idx, (dep, _) in sorted_deps:
-            n = next((n for n in root.descendants if n.idx == idx), None)
-            dep_n = next((n for n in root.descendants if n.idx == dep), None)
-            if dep_n is None:
-                dep_n = Node(self.aspects[dep], parent=root)
-                dep_n.idx = dep
-            if n is not None:
-                if dep_n not in n.descendants and dep_n.depth + (max(c.depth for c in n.descendants) if n.descendants else 0) <= TargetExtractor.MAX_DEPTH:
-                    n.parent = dep_n
-            else:
-                if dep_n.depth < TargetExtractor.MAX_DEPTH:
-                    n = Node(self.aspects[idx], parent=dep_n)
-                else:
-                    n = Node(self.aspects[idx], parent=root)
-                n.idx = idx
-        return root
-
-    def get_product_tree3(self):
+    def get_product_tree(self):
         root = Node(self.aspects[0])
         root.idx = 0
-        deps = {idx: self.aspect_dependence_with_strength2(idx) for idx in range(1, len(self.aspects))}
+        deps = {idx: self.aspect_dependence_with_strength(idx) for idx in range(1, len(self.aspects))}
         for no_dep_idx in {idx for idx, dep in deps.items() if dep is None}:
             node = Node(self.aspects[no_dep_idx], parent=root)
@@ -421,6 +358,7 @@ class TargetExtractor:
         sorted_deps = sorted(deps.items(), key=lambda x: x[1][1], reverse=True)
         for idx, (dep_idx, _) in sorted_deps:
+            # print(self.aspects[idx], self.aspects[dep_idx])
             if any(n for n in root.descendants if n.idx == idx):
                 continue
@@ -431,6 +369,7 @@ class TargetExtractor:
             else:
                 n = Node(self.aspects[idx], parent=dep_n.parent)
         else:
+            print(self.aspects[idx], self.aspects[dep_idx])
             dep_n = Node(self.aspects[dep_idx], parent=root)
             dep_n.idx = dep_idx
             n = Node(self.aspects[idx], parent=dep_n)
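The surviving get_product_tree builds the hierarchy with anytree: aspects with no clear dependence hang directly off the product root, and the rest attach under their strongest relative in decreasing order of dependence strength. A self-contained sketch of that pattern; the aspects and deps values are hypothetical:

    from anytree import Node, RenderTree

    aspects = ['camera', 'picture', 'lens', 'battery']
    deps = {1: (0, 0.9), 2: (1, 0.7), 3: None}  # idx -> (parent idx, strength) or None

    root = Node(aspects[0])
    root.idx = 0
    for no_dep_idx in {idx for idx, dep in deps.items() if dep is None}:
        node = Node(aspects[no_dep_idx], parent=root)  # no dependence: child of the product
        node.idx = no_dep_idx
        del deps[no_dep_idx]
    for idx, (dep_idx, _) in sorted(deps.items(), key=lambda x: x[1][1], reverse=True):
        parent = next(n for n in (root,) + root.descendants if n.idx == dep_idx)
        node = Node(aspects[idx], parent=parent)
        node.idx = idx

    for pre, _, node in RenderTree(root):
        print(pre + node.name)  # camera, with battery and picture beneath it, and lens under picture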
@@ -455,21 +394,48 @@ class TargetExtractor:
         for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True)[:20]:
             print('  {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])
 
+    def print_relations_to(self, aspect):
+        idx = self.aspects.index(aspect)
+        rels = self.relatedness_matrix[:, idx].copy()
+        print(' relations to {}:'.format(aspect))
+        for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True)[:20]:
+            print('  {:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])
+
+    def product_absorption(self, m):
+        product_rels = m[:, 0].copy()
+        vals = np.empty(len(self.aspects) - 1)
+        for idx in range(1, len(self.aspects)):
+            vals[idx-1] = self.similarity(self.aspects[idx], self.aspects[0]) / product_rels[idx]
+        fst_q = np.quantile(vals, 0.75)
+        snd_q = np.quantile(vals, 0.25)
+        threshold = snd_q + TargetExtractor.PRODUCT_ABSORPTION_MULT * (fst_q - snd_q)
+        return [idx for idx in range(1, len(self.aspects)) if vals[idx-1] > threshold]
+
     def get_syn_dict(self):
         all_pairs = {frozenset((t1, t2)) for t1 in self.aspects for t2 in self.aspects if t1 != t2}
         syn_pairs = {frozenset((t1, t2)) for t1, t2 in all_pairs if self.are_syns(t1, t2)}
         synset = Synset(self.aspects, syn_pairs, self.product)
         return synset.get_dict(self.counts)
 
+    def similarity(self, t1, t2):
+        return self.wv.relative_cosine_similarity(t1, t2) + self.wv.relative_cosine_similarity(t2, t1)
+
     def are_syns(self, t1, t2):
         if wnl.lemmatize(t1) == wnl.lemmatize(t2):
             return True
-        if self.product in [t1, t2]:
-            return (self.wv.relative_cosine_similarity(t1, t2) >= TargetExtractor.SYNONYM_SIMILARITY_PRODUCT or
-                    self.wv.relative_cosine_similarity(t2, t1) >= TargetExtractor.SYNONYM_SIMILARITY_PRODUCT)
-        else:
-            sim_sum = self.wv.relative_cosine_similarity(t1, t2) + self.wv.relative_cosine_similarity(t2, t1)
-            return sim_sum >= TargetExtractor.SYNONYM_SIMILARITY
+        return self.similarity(t2, t1) >= TargetExtractor.SYNONYM_SIMILARITY
+
+    def get_feature_similarity(self, m, idx1, idx2):
+        r1 = np.delete(m[idx1], [idx1, idx2]).reshape(1, -1)
+        r2 = np.delete(m[idx2], [idx1, idx2]).reshape(1, -1)
+        c1 = np.delete(m[:, idx1], [idx1, idx2]).reshape(1, -1)
+        c2 = np.delete(m[:, idx2], [idx1, idx2]).reshape(1, -1)
+        return cosine_similarity((c1 - r1), (c2 - r2))[0][0]
+
+    def get_inverse_co_occurrence(self, m, idx1, idx2):
+        return (self.counts[self.aspects[idx1]] * self.counts[self.aspects[idx2]]) / (m[idx1][idx2] * m[idx2][idx1])
 
 class Synset:
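The new get_feature_similarity (using the sklearn import added at the top of the file) compares two aspects by the cosine similarity of their column-minus-row relation profiles, after dropping the pair's own entries, presumably as groundwork for spotting features that relate to the rest of the ontology in the same way. A self-contained sketch on random data:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    def feature_similarity(m, idx1, idx2):
        # rows hold relations from an aspect, columns relations to it;
        # drop the idx1/idx2 entries so the pair's mutual relation is ignored
        r1 = np.delete(m[idx1], [idx1, idx2]).reshape(1, -1)
        r2 = np.delete(m[idx2], [idx1, idx2]).reshape(1, -1)
        c1 = np.delete(m[:, idx1], [idx1, idx2]).reshape(1, -1)
        c2 = np.delete(m[:, idx2], [idx1, idx2]).reshape(1, -1)
        return cosine_similarity(c1 - r1, c2 - r2)[0][0]

    m = np.random.default_rng(0).random((5, 5))
    print(feature_similarity(m, 1, 2))  # a value in [-1, 1]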
@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f'
 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = True
 
-ALLOWED_HOSTS = ['192.168.1.104']
+ALLOWED_HOSTS = ['192.168.1.148']
 
 # Application definition