Commit 4f0969e9 authored by Joel Oksanen
Browse files

Implemented synonym dict for TargetExtractor

parent 4ff3515e
...@@ -3,7 +3,7 @@ import ast ...@@ -3,7 +3,7 @@ import ast
from collections import Counter from collections import Counter
from nltk import pos_tag from nltk import pos_tag
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet from nltk.corpus import stopwords, wordnet, wordnet_ic
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
import string import string
from gensim.models.phrases import Phrases, Phraser from gensim.models.phrases import Phrases, Phraser
...@@ -38,16 +38,26 @@ class TargetExtractor: ...@@ -38,16 +38,26 @@ class TargetExtractor:
tokenized_phrases = Phrases(self.phrases) tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases) self.bigrammer = Phraser(tokenized_phrases)
def get_tree(self): # mine aspects
# mine targets aspects, counts = self.get_related_nouns(30)
targets, counts = self.get_related_nouns(50) print(aspects)
# obtain synonyms
synset = Synset(aspects)
self.syn_dict = synset.get_dict(counts)
# extract relationships between aspects
relatedness_matrix = self.get_relations(aspects, counts)
# extract relationships between targets self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
relatedness_matrix = self.get_relations(targets, counts) print(RenderTree(self.tree))
tree = TargetExtractor.spanning_tree_from_root(targets, relatedness_matrix) def get_tree(self):
print(RenderTree(tree)) return self.tree
return tree
def get_synonyms(self):
return self.syn_dict
def get_relations(self, targets, counts): def get_relations(self, targets, counts):
pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)} pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
...@@ -156,42 +166,68 @@ class TargetExtractor: ...@@ -156,42 +166,68 @@ class TargetExtractor:
for item in ast.literal_eval(items)] for item in ast.literal_eval(items)]
class Targets: class Synset:
def __init__(self, targets):
self.targets = targets
self.groups = {i: {target} for i, target in enumerate(targets)}
self.next_idx = len(targets)
def get(self): def __init__(self, aspects):
return self.targets self.vocab = aspects
self.syn_pairs = {frozenset((aspect, syn)) for aspect in aspects
for syn in self.get_syns(aspect) if aspect != syn}
def set_syns(self, syns): def get_syns(self, word):
syn_set = {syn for syn in syns if syn in self.targets} syns = set()
if not self.is_group(syn_set): for syn in wordnet.synsets(word, pos=wordnet.NOUN):
i = self.next_idx for lemma in syn.lemmas():
self.next_idx += 1 syns.add(
self.clear_subgroups(syn_set) syns = {syn for syn in syns if syn in self.vocab and cnet.get_relatedness(syn, word) > 0.5}
self.groups[i] = syn_set return syns
def is_group(self, syns): def get_dict(self, counts):
return any(syns in group_syns for group_syns in self.groups.values()) groups = self.get_groups()
return {max(group, key=counts.get): group for group in groups}
def clear_subgroups(self, syns):
self.groups = {group: group_syns for group, group_syns in self.groups.items() if not group_syns.issubset(syns)}
def get_groups(self): def get_groups(self):
return [syns for group, syns in self.groups.items()] groups = []
for w1, w2 in self.syn_pairs:
if not Synset.join_groups(w1, w2, groups):
groups.append({w1, w2})
for word in self.vocab:
if not Synset.group_for(word, groups):
return groups
# {a, b} and {b, c} become {a, b, c}
def join_groups(w1, w2, groups):
g1 = Synset.group_for(w1, groups)
g2 = Synset.group_for(w2, groups)
if g1:
if g2:
g1 = g1 if g1 else {w1}
g2 = g2 if g2 else {w2}
return True
# {a, b} and {b, c} are separate groups unless {a, c}
def join_identical_groups(w1, w2, groups):
for g1 in [group for group in groups if w1 in group]:
for g2 in [group for group in groups if w2 in group]:
if g1 - {w1} == g2 - {w2}:
return True
return False
def get_syns(word): @staticmethod
syns = {word} def group_for(w, groups):
for syn in wordnet.synsets(word): for group in groups:
for lemma in syn.lemmas(): if w in group:
syns.add( return group
return syns return None
extractor = TargetExtractor('camera', 'data/camera_metadata.tsv') extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
extractor.get_tree() extractor.get_tree()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment