Commit 4ff3515e authored by Joel Oksanen
Browse files

Implemented aspect structuring in target extractor. TODO: handling of synonyms

parent 5039567d
......@@ -8,19 +8,24 @@ class ConceptNet:
url = ''
limit = 5
def find_rels(self, feature, rel):
def find_related(self, feature, rel):
    """Return the distinct end-node labels of edges from `feature` with relation `rel`.

    Sends a GET request to ``self.url`` + ``/query`` restricted to English
    (``/c/en``) nodes and at most ``self.limit`` edges.

    :param feature: concept to look up; spaces are replaced by underscores so
        multi-word terms are handled the same way as in ``get_relatedness``
    :param rel: relation name, inserted after ``/r/`` in the query
    :return: set of the ``['end']['label']`` values of the returned edges
    """
    uri = '/query?node=/c/en/{feature}&other=/c/en&rel=/r/{rel}&limit={limit}'.format(
        feature=feature.replace(' ', '_'), rel=rel, limit=self.limit)
    obj = requests.get(self.url + uri).json()
    # a set comprehension de-duplicates labels without an index loop
    return {edge['end']['label'] for edge in obj['edges']}
def find_relations(self, f1, f2):
    """Query the edges that connect the English concepts `f1` and `f2`.

    Sends a GET request to ``self.url`` + ``/query``. Spaces in either
    concept are replaced by underscores so multi-word terms are handled the
    same way as in ``get_relatedness``.

    :param f1: first concept
    :param f2: second concept
    :return: the decoded JSON body of the response, unmodified
    """
    uri = '/query?node=/c/en/{f1}&other=/c/en/{f2}'.format(
        f1=f1.replace(' ', '_'), f2=f2.replace(' ', '_'))
    obj = requests.get(self.url + uri).json()
    return obj
def get_relatedness(self, f1, f2):
    """Return the ``'value'`` field of the ``/relatedness`` response for `f1` and `f2`.

    :param f1: first concept; spaces are replaced by underscores
    :param f2: second concept; spaces are replaced by underscores
    :return: ``response['value']`` from the decoded JSON body
    """
    node1 = f1.replace(' ', '_')
    node2 = f2.replace(' ', '_')
    uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=node1, f2=node2)
    response = requests.get(self.url + uri)
    return response.json()['value']
def append_result(self, feature, rel, result_set, lock):
rels = self.find_rels(feature, rel)
rels = self.find_related(feature, rel)
......@@ -55,6 +60,7 @@ class ConceptNet:
self.parent_check(node, node.parent, synonyms)
return synonyms
def sub_features_for_node(self, node):
......@@ -72,11 +78,3 @@ class ConceptNet:
return features
net = ConceptNet()
# parent = Node(str(sys.argv[1]))
# child = Node(str(sys.argv[2]), parent=parent)
# syns = net.sem_synonyms_for_node(child)
# print(syns)
node = Node('camera')
......@@ -3,42 +3,83 @@ import ast
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
from concept_net import ConceptNet
from anytree import Node, RenderTree
import itertools
import numpy as np
from sklearn.preprocessing import normalize
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
cnet = ConceptNet()
class TargetExtractor:
def __init__(self, metadata_path):
def __init__(self, product, metadata_path):
self.product = product
self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
self.features = self.get_all('feature')
self.descriptions = self.get_all('description')
self.tech1 = self.get_all('tech1')
self.tech2 = self.get_all('tech2')
print(len(self.features), len(self.descriptions), len(self.tech1), len(self.tech2))
n = 50
print('features:', TargetExtractor.get_common_nouns(self.features, n))
# print('descriptions:', TargetExtractor.get_common_nouns(self.descriptions, n))
# print('tech1:', TargetExtractor.get_common_nouns(self.tech1, n))
def get_common_nouns(phrases, n):
# tokenize and normalize phrases
tokenized_normalized = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in phrases]
self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
for phrase in self.features]
# train bigram map
tokenized_phrases = Phrases(tokenized_normalized)
bigrammer = Phraser(tokenized_phrases)
tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases)
def get_tree(self):
    """Build the target tree for this extractor.

    Mines targets with ``get_related_nouns(50)``, scores their pairwise
    relations with ``get_relations`` and hands both to
    ``TargetExtractor.spanning_tree_from_root``.

    :return: whatever ``spanning_tree_from_root`` returns (the tree root)
    """
    # step 1: candidate targets and their occurrence counts
    targets, counts = self.get_related_nouns(50)
    # step 2: pairwise relatedness, then arrange the targets into a tree
    return TargetExtractor.spanning_tree_from_root(targets, self.get_relations(targets, counts))
def get_relations(self, targets, counts):
    """Compute a symmetric, max-normalised co-occurrence matrix for `targets`.

    For every pair of targets the score is
    ``(phrases containing both) / (counts[t1] * counts[t2])``; the matrix is
    then divided by its largest entry. The diagonal stays 0.

    :param targets: sequence of target terms; row/column ``i`` is ``targets[i]``
    :param counts: mapping from target term to its occurrence count
    :return: ``len(targets) x len(targets)`` numpy array; all zeros when no
        pair ever co-occurs (instead of NaN from a 0/0 division), and an empty
        ``(0, 0)`` array when `targets` is empty
    """
    size = len(targets)

    # pair_counts[(row, col)] with row < col: number of phrases holding both targets
    pair_counts = Counter()
    for phrase in self.phrases:
        # set membership is O(1); only pairs of targets actually present are visited
        present = set(self.bigrammer[phrase])
        hits = [i for i, target in enumerate(targets) if target in present]
        for pair in itertools.combinations(hits, 2):
            pair_counts[pair] += 1

    relatedness_matrix = np.zeros((size, size))
    for row, col in itertools.combinations(range(size), 2):
        score = pair_counts[(row, col)] / (counts[targets[row]] * counts[targets[col]])
        # fill both triangles at once to keep the matrix symmetric
        relatedness_matrix[row][col] = score
        relatedness_matrix[col][row] = score

    # np.amax raises on an empty array, and a zero maximum would turn every entry into NaN
    if relatedness_matrix.size == 0:
        return relatedness_matrix
    peak = np.amax(relatedness_matrix)
    if peak > 0:
        relatedness_matrix = np.divide(relatedness_matrix, peak)
    return relatedness_matrix
def get_related_nouns(self, n):
nouns = []
for phrase in tokenized_normalized:
for phrase in self.phrases:
pos_tags = pos_tag(phrase)
bigrams = bigrammer[phrase]
bigrams = self.bigrammer[phrase]
word_idx = 0
for token in bigrams:
if '_' in token:
......@@ -47,12 +88,59 @@ class TargetExtractor:
word_idx += len(words)
if TargetExtractor.is_noun(pos_tags[word_idx]):
if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
word_idx += 1
c = Counter(nouns)
return c.most_common(n)
common = c.most_common(n)
# filter terms not related to product
common = [(term, count) for term, count in common if
cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS]
# bring product to front of list
targets = [target for target, _ in common]
if self.product in targets:
targets.insert(0, self.product)
return targets, {target: count for target, count in common}
def spanning_tree_from_root(vertices, weights, root_idx=0):
    """Greedily arrange `vertices` into an anytree hierarchy rooted at `vertices[root_idx]`.

    Vertices are visited in decreasing order of their weight to the root.
    Each one is attached to the root, unless some node already in the tree
    offers a larger scaled weight, in which case it is attached there.

    :param vertices: sequence of node names
    :param weights: square matrix; ``weights[a][b]`` is the weight between
        ``vertices[a]`` and ``vertices[b]``
    :param root_idx: index of the root vertex
    :return: the root ``Node``; every non-root node carries an ``idx``
        attribute holding its index in `vertices`
    """
    # NOTE(review): relies on TargetExtractor.DEPTH_COST, which is not defined
    # in the visible part of the class - confirm it exists.
    root = Node(vertices[root_idx])
    # argsort is ascending; flip to visit the vertices closest to the root first
    for idx in np.flip(np.argsort(weights[root_idx])):
        if idx == root_idx:
            # the root is already in the tree and must not be attached to itself
            continue
        # baseline choice: hang the vertex directly under the root
        gain = weights[root_idx][idx]
        parent = root
        for branch_node in root.descendants:
            depth_factor = pow(TargetExtractor.DEPTH_COST, branch_node.depth)
            # weakest link between the candidate and branch_node or any of its
            # ancestors; the root is excluded because it has no `idx` attribute
            min_scaled_weight = min(weights[n.idx][idx] * depth_factor
                                    for n in (branch_node,) + branch_node.ancestors if n != root)
            if min_scaled_weight > gain:
                gain = min_scaled_weight
                parent = branch_node
        node = Node(vertices[idx], parent=parent)
        node.idx = idx
    return root
def kruskal(vertices, edges):
    """Pick spanning edges by Kruskal's procedure, heaviest edge first.

    :param vertices: iterable of vertices
    :param edges: dict mapping ``(u, v)`` pairs to their weight
    :return: set of the ``(u, v)`` pairs that joined two separate groups
    """
    # every vertex starts in its own group, labelled by its position
    component = {}
    for label, vertex in enumerate(vertices):
        component[vertex] = label

    chosen = set()
    heaviest_first = sorted(edges, key=edges.get, reverse=True)
    for u, v in heaviest_first:
        keep, absorb = component[u], component[v]
        if keep == absorb:
            # both ends are already connected; taking this edge would close a cycle
            continue
        chosen.add((u, v))
        TargetExtractor.join_groups(component, keep, absorb)
    return chosen
def join_groups(groups, i, j):
    """Relabel, in place, every vertex of group `j` as a member of group `i`.

    :param groups: dict mapping vertex to group label; mutated
    :param i: label that survives
    :param j: label that is absorbed
    """
    absorbed = [vertex for vertex, label in groups.items() if label == j]
    groups.update(dict.fromkeys(absorbed, i))
def singular(word):
......@@ -67,8 +155,43 @@ class TargetExtractor:
return [item for _, items in self.metadata[col].items() if not pd.isnull(items)
for item in ast.literal_eval(items)]
def extract(self):
class Targets:
    """A list of target terms together with the synonym groups formed among them."""

    def __init__(self, targets):
        """Start with every target in its own singleton group.

        :param targets: sequence of target terms
        """
        self.targets = targets
        # group id -> set of terms; initially one group per target, keyed by its index
        self.groups = {i: {target} for i, target in enumerate(targets)}
        # id handed to the next group created by set_syns
        self.next_idx = len(targets)

    def get(self):
        """Return the target sequence passed to the constructor."""
        return self.targets

    def set_syns(self, syns):
        """Register the targets found in `syns` as a new group.

        Terms that are not targets are dropped. Nothing is added when no
        target remains or when an existing group already covers the set.

        :param syns: iterable of candidate synonyms
        """
        syn_set = {syn for syn in syns if syn in self.targets}
        if syn_set and not self.is_group(syn_set):
            self.groups[self.next_idx] = syn_set
            self.next_idx += 1

    def is_group(self, syns):
        """Return True when some existing group contains every term of `syns`.

        The previous ``syns in group_syns`` asked whether the whole set was an
        element of a set of strings, which is never true.
        """
        return any(group_syns.issuperset(syns) for group_syns in self.groups.values())

    def clear_subgroups(self, syns):
        """Drop every group that is a subset of `syns`."""
        self.groups = {group: group_syns for group, group_syns in self.groups.items()
                       if not group_syns.issubset(syns)}

    def get_groups(self):
        """Return the groups as a list of sets, in insertion order."""
        return list(self.groups.values())
def get_syns(word):
    """Return `word` plus the lemma names of every WordNet synset found for it.

    :param word: term to look up with ``wordnet.synsets``
    :return: set of strings that always contains `word` itself
    """
    syns = {word}
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # NOTE(review): the inner loop had no body, so nothing was ever
            # collected; adding the lemma name is the evident intent - confirm.
            syns.add(lemma.name())
    return syns
# Runs at module level: builds a TargetExtractor for the 'camera' product from data/camera_metadata.tsv.
extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment