Commit 4ff3515e authored by Joel Oksanen

Implemented aspect structuring in target extractor. TODO: handling of synonyms

parent 5039567d
@@ -8,19 +8,24 @@ class ConceptNet:
     url = 'http://api.conceptnet.io'
     limit = 5

-    def find_rels(self, feature, rel):
+    def find_related(self, feature, rel):
         uri = '/query?node=/c/en/{feature}&other=/c/en&rel=/r/{rel}&limit={limit}'.format(feature=feature, rel=rel, limit=self.limit)
         obj = requests.get(self.url + uri).json()
         unique = set([obj['edges'][i]['end']['label'] for i in range(len(obj['edges']))])
         return unique

+    def find_relations(self, f1, f2):
+        uri = '/query?node=/c/en/{f1}&other=/c/en/{f2}'.format(f1=f1, f2=f2)
+        obj = requests.get(self.url + uri).json()
+        return obj
+
     def get_relatedness(self, f1, f2):
         uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=f1.replace(' ','_'), f2=f2.replace(' ','_'))
         obj = requests.get(self.url + uri).json()
         return obj['value']

     def append_result(self, feature, rel, result_set, lock):
-        rels = self.find_rels(feature, rel)
+        rels = self.find_related(feature, rel)
         lock.acquire()
         result_set.update(rels)
         lock.release()
@@ -55,6 +60,7 @@ class ConceptNet:
             self.parent_check(node, node.parent, synonyms)
+        synonyms.add(node.name)
         return synonyms

     def sub_features_for_node(self, node):
@@ -72,11 +78,3 @@ class ConceptNet:
             t.join()

         return features
-
-net = ConceptNet()
-# parent = Node(str(sys.argv[1]))
-# child = Node(str(sys.argv[2]), parent=parent)
-# syns = net.sem_synonyms_for_node(child)
-# print(syns)
-node = Node('camera')
-print(net.sub_features_for_node(node))
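Editor's note: a minimal usage sketch of the renamed ConceptNet wrapper above. The relation name 'PartOf' and the example terms are illustrative, and real calls hit the live api.conceptnet.io endpoint:

    net = ConceptNet()

    # Labels on the far end of up to `limit` /r/PartOf edges touching 'camera'.
    print(net.find_related('camera', 'PartOf'))

    # Relatedness score (higher means more related); the extractor below uses
    # this to filter candidate aspect terms against the product name.
    print(net.get_relatedness('battery life', 'camera'))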
@@ -3,42 +3,83 @@ import ast
 from collections import Counter
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
+from nltk.corpus import stopwords, wordnet
 from nltk.stem import WordNetLemmatizer
 import string
 from gensim.models.phrases import Phrases, Phraser
+from concept_net import ConceptNet
+from anytree import Node, RenderTree
+import itertools
+import numpy as np
+from sklearn.preprocessing import normalize

 stop_words = stopwords.words('english')
 wnl = WordNetLemmatizer()
+cnet = ConceptNet()
 class TargetExtractor:

+    MIN_RELATEDNESS = 0.1
+    DEPTH_COST = 0.3
+
-    def __init__(self, metadata_path):
+    def __init__(self, product, metadata_path):
+        self.product = product
         self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
         self.features = self.get_all('feature')
         self.descriptions = self.get_all('description')
         self.tech1 = self.get_all('tech1')
         self.tech2 = self.get_all('tech2')
-        print(len(self.features), len(self.descriptions), len(self.tech1), len(self.tech2))
-        n = 50
-        print('features:', TargetExtractor.get_common_nouns(self.features, n))
-        # print('descriptions:', TargetExtractor.get_common_nouns(self.descriptions, n))
-        # print('tech1:', TargetExtractor.get_common_nouns(self.tech1, n))
-
-    @staticmethod
-    def get_common_nouns(phrases, n):
         # tokenize and normalize phrases
-        tokenized_normalized = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
-                                for phrase in phrases]
+        self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
+                        for phrase in self.features]

         # train bigram map
-        tokenized_phrases = Phrases(tokenized_normalized)
-        bigrammer = Phraser(tokenized_phrases)
+        tokenized_phrases = Phrases(self.phrases)
+        self.bigrammer = Phraser(tokenized_phrases)
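Editor's note: the Phraser merges frequent collocations into single '_'-joined tokens, which is what lets multiword aspects like 'battery life' surface as one target. A toy illustration with an invented corpus (min_count and threshold are tuned down for the tiny example):

    from gensim.models.phrases import Phrases, Phraser

    corpus = [['great', 'battery', 'life'],
              ['battery', 'life', 'is', 'poor'],
              ['battery', 'life', 'rocks'],
              ['nice', 'screen']]
    bigrammer = Phraser(Phrases(corpus, min_count=2, threshold=0.5))
    print(bigrammer[['the', 'battery', 'life']])  # expected: ['the', 'battery_life']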
+    def get_tree(self):
+        # mine targets
+        targets, counts = self.get_related_nouns(50)
+        # extract relationships between targets
+        relatedness_matrix = self.get_relations(targets, counts)
+        tree = TargetExtractor.spanning_tree_from_root(targets, relatedness_matrix)
+        print(RenderTree(tree))
+        return tree
+
+    def get_relations(self, targets, counts):
+        # count phrases in which each pair of targets co-occurs
+        pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
+        for phrase in self.phrases:
+            bigrams = self.bigrammer[phrase]
+            for pair in pair_counts:
+                t1, t2 = pair
+                if t1 in bigrams and t2 in bigrams:
+                    pair_counts[pair] += 1
+
+        # normalise co-occurrence counts by the individual target frequencies
+        relatedness_matrix = np.zeros((len(targets), len(targets)))
+        for row in range(0, len(targets) - 1):
+            for col in range(row + 1, len(targets)):
+                t1 = targets[row]
+                t2 = targets[col]
+                score = pair_counts[(t1, t2)] / (counts[t1] * counts[t2])
+                relatedness_matrix[row][col] = score
+
+        # mirror the upper triangle to make the matrix symmetric
+        for col in range(0, len(targets) - 1):
+            for row in range(col + 1, len(targets)):
+                relatedness_matrix[row][col] = relatedness_matrix[col][row]
+
+        # rescale so the strongest relation has weight 1
+        relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
+        return relatedness_matrix
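Editor's note: the relatedness score is the number of phrases in which two targets co-occur, normalised by the product of their individual counts and rescaled so the largest entry is 1. A worked example with invented counts:

    # 'camera' and 'lens' co-occur in 20 phrases; 'camera' and 'screen' in 6.
    counts = {'camera': 100, 'lens': 40, 'screen': 30}
    pair_counts = {('camera', 'lens'): 20, ('camera', 'screen'): 6, ('lens', 'screen'): 0}

    raw = {pair: n / (counts[pair[0]] * counts[pair[1]]) for pair, n in pair_counts.items()}
    top = max(raw.values())
    print({pair: score / top for pair, score in raw.items()})
    # ('camera', 'lens') -> 1.0, ('camera', 'screen') -> 0.4, ('lens', 'screen') -> 0.0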
+    def get_related_nouns(self, n):
         nouns = []
-        for phrase in tokenized_normalized:
+        for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = bigrammer[phrase]
+            bigrams = self.bigrammer[phrase]
             word_idx = 0
             for token in bigrams:
                 if '_' in token:
@@ -47,12 +88,59 @@ class TargetExtractor:
                     nouns.append(token)
                     word_idx += len(words)
                 else:
-                    if TargetExtractor.is_noun(pos_tags[word_idx]):
+                    if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
                         nouns.append(token)
                     word_idx += 1

         c = Counter(nouns)
-        return c.most_common(n)
+        common = c.most_common(n)
+
+        # filter terms not related to product
+        common = [(term, count) for term, count in common if
+                  cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS]
+
+        # bring product to front of list
+        targets = [target for target, _ in common]
+        if self.product in targets:
+            targets.remove(self.product)
+        targets.insert(0, self.product)
+
+        return targets, {target: count for target, count in common}
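Editor's note: the MIN_RELATEDNESS filter is what keeps frequent but off-topic nouns out of the target list. A sketch with invented relatedness values standing in for the ConceptNet responses:

    # Invented scores; in the real code they come from cnet.get_relatedness.
    relatedness = {'camera': 1.0, 'lens': 0.62, 'warranty': 0.05}
    common = [('camera', 100), ('lens', 40), ('warranty', 25)]
    filtered = [(t, c) for t, c in common if relatedness[t] > 0.1]
    print(filtered)  # [('camera', 100), ('lens', 40)]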
+    @staticmethod
+    def spanning_tree_from_root(vertices, weights, root_idx=0):
+        root = Node(vertices[root_idx])
+
+        # attach vertices in order of decreasing relatedness to the root
+        for idx in np.flip(np.argsort(weights[root_idx])):
+            if idx == root_idx:
+                continue
+            # default: attach directly under the root
+            gain = weights[root_idx][idx]
+            parent = root
+            # nest under a branch node only if every edge on the path, scaled
+            # by a depth penalty, beats the direct edge to the root
+            for branch_node in root.descendants:
+                min_scaled_weight = min(weights[n.idx][idx] * pow(TargetExtractor.DEPTH_COST, branch_node.depth)
+                                        for n in (branch_node,) + branch_node.ancestors if n != root)
+                if min_scaled_weight > gain:
+                    gain = min_scaled_weight
+                    parent = branch_node
+            node = Node(vertices[idx], parent=parent)
+            node.idx = idx
+
+        return root
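Editor's note: the DEPTH_COST factor penalises deep nesting, so a vertex only attaches below another target when their depth-discounted relation is stronger than its direct relation to the root. A quick numeric check with invented weights:

    DEPTH_COST = 0.3
    w_zoom_camera = 0.2   # direct relatedness of 'zoom' to the root 'camera'
    w_zoom_lens = 0.9     # relatedness of 'zoom' to 'lens', a depth-1 node

    # 0.9 * 0.3**1 = 0.27 > 0.2, so 'zoom' would be nested under 'lens'.
    print(w_zoom_lens * DEPTH_COST ** 1 > w_zoom_camera)  # True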
+    @staticmethod
+    def kruskal(vertices, edges):
+        # maximum spanning tree: greedily take the heaviest edge that
+        # still connects two previously unconnected groups
+        result = set()
+        groups = {vertex: i for i, vertex in enumerate(vertices)}
+        for u, v in sorted(edges, key=edges.get, reverse=True):
+            if groups[u] != groups[v]:
+                result.add((u, v))
+                TargetExtractor.join_groups(groups, groups[u], groups[v])
+        return result
+
+    @staticmethod
+    def join_groups(groups, i, j):
+        # relabel every vertex in group j as belonging to group i
+        for v in groups:
+            if groups[v] == j:
+                groups[v] = i
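Editor's note: kruskal is not yet called by get_tree above, but it computes a maximum spanning tree over the weighted target graph. A toy run with invented weights, assuming the class above is importable:

    vertices = ['camera', 'lens', 'screen']
    edges = {('camera', 'lens'): 0.9, ('lens', 'screen'): 0.7, ('camera', 'screen'): 0.4}
    print(TargetExtractor.kruskal(vertices, edges))
    # {('camera', 'lens'), ('lens', 'screen')} -- ('camera', 'screen') would close a cycle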
     @staticmethod
     def singular(word):
@@ -67,8 +155,43 @@ class TargetExtractor:
         return [item for _, items in self.metadata[col].items() if not pd.isnull(items)
                 for item in ast.literal_eval(items)]
+    def extract(self):
+        pass
+
+
+class Targets:
+
+    def __init__(self, targets):
+        self.targets = targets
+        # initially, every target is its own synonym group
+        self.groups = {i: {target} for i, target in enumerate(targets)}
+        self.next_idx = len(targets)
+
+    def get(self):
+        return self.targets
+
+    def set_syns(self, syns):
+        # keep only synonyms that are known targets
+        syn_set = {syn for syn in syns if syn in self.targets}
+        if not self.is_group(syn_set):
+            i = self.next_idx
+            self.next_idx += 1
+            self.clear_subgroups(syn_set)
+            self.groups[i] = syn_set
+
+    def is_group(self, syns):
+        # a synonym set counts as grouped if an existing group contains it
+        return any(syns.issubset(group_syns) for group_syns in self.groups.values())
+
+    def clear_subgroups(self, syns):
+        self.groups = {group: group_syns for group, group_syns in self.groups.items() if not group_syns.issubset(syns)}
+
+    def get_groups(self):
+        return [syns for group, syns in self.groups.items()]
+
+
+def get_syns(word):
+    syns = {word}
+    for syn in wordnet.synsets(word):
+        for lemma in syn.lemmas():
+            syns.add(lemma.name())
+    return syns
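Editor's note: get_syns collects the lemma names from every WordNet synset of a word, presumably the starting point for the synonym handling flagged as TODO in the commit message. Example output (requires the NLTK wordnet corpus; exact lemmas depend on the installed version):

    print(get_syns('display'))
    # e.g. {'display', 'show', 'exhibit', 'expose', 'presentation', ...}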
-TargetExtractor('data/camera_metadata.tsv')
+extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
+extractor.get_tree()