Commit 4f0969e9 authored by Joel Oksanen
Browse files

Implemented synonym dict for TargetExtractor

parent 4ff3515e
......@@ -3,7 +3,7 @@ import ast
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.corpus import stopwords, wordnet, wordnet_ic
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
......@@ -38,16 +38,26 @@ class TargetExtractor:
tokenized_phrases = Phrases(self.phrases)
self.bigrammer = Phraser(tokenized_phrases)
def get_tree(self):
# mine targets
targets, counts = self.get_related_nouns(50)
# mine aspects
aspects, counts = self.get_related_nouns(30)
# obtain synonyms
synset = Synset(aspects)
self.syn_dict = synset.get_dict(counts)
# extract relationships between aspects
relatedness_matrix = self.get_relations(aspects, counts)
# extract relationships between targets
relatedness_matrix = self.get_relations(targets, counts)
self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
tree = TargetExtractor.spanning_tree_from_root(targets, relatedness_matrix)
return tree
# Return the tree stored on the instance as self.tree (assigned from
# TargetExtractor.spanning_tree_from_root(...) elsewhere in this class).
def get_tree(self):
return self.tree
# Return the synonym dictionary stored as self.syn_dict (assigned from
# Synset.get_dict(counts) elsewhere in this class).
def get_synonyms(self):
return self.syn_dict
def get_relations(self, targets, counts):
pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
......@@ -156,42 +166,68 @@ class TargetExtractor:
for item in ast.literal_eval(items)]
class Targets:
# Store the target words and seed one singleton group per target.
#   targets: collection of target words; it is enumerated and passed to len().
# self.groups maps an integer id to a set of words, starting with one
# single-word set per target under ids 0..len(targets)-1.
# self.next_idx is the next unused group id.
def __init__(self, targets):
self.targets = targets
self.groups = {i: {target} for i, target in enumerate(targets)}
self.next_idx = len(targets)
class Synset:
# Return the targets collection stored on the instance as self.targets.
def get(self):
return self.targets
# Record the aspect vocabulary and precompute its synonym pairs.
#   aspects: iterable of aspect words; kept as self.vocab.
# self.syn_pairs is a set of two-element frozensets {aspect, syn}, one for
# every aspect and every synonym returned by self.get_syns(aspect) that differs
# from the aspect itself. Using frozensets makes the pair unordered, so
# (a, b) and (b, a) collapse into a single entry.
def __init__(self, aspects):
self.vocab = aspects
self.syn_pairs = {frozenset((aspect, syn)) for aspect in aspects
for syn in self.get_syns(aspect) if aspect != syn}
# Register a synonym group built from `syns`, restricted to known targets.
#   syns: iterable of candidate synonyms; words not in self.targets are dropped.
# The filtered set is stored in self.groups under a fresh integer id taken from
# self.next_idx, unless self.is_group reports it as already present.
# NOTE(review): indentation is not preserved in this view; the three statements
# after the `if` are presumably its body (they all depend on `i`) - confirm.
def set_syns(self, syns):
syn_set = {syn for syn in syns if syn in self.targets}
if not self.is_group(syn_set):
i = self.next_idx
self.next_idx += 1
self.groups[i] = syn_set
def get_syns(self, word):
syns = set()
for syn in wordnet.synsets(word, pos=wordnet.NOUN):
for lemma in syn.lemmas():
syns = {syn for syn in syns if syn in self.vocab and cnet.get_relatedness(syn, word) > 0.5}
return syns
# Report whether `syns` is found inside any of the existing groups.
# NOTE(review): `syns in group_syns` is a membership test - it asks whether
# `syns` itself is an *element* of each group, not whether it equals the group
# or is a subset of it. set_syns passes a set of words here and the groups are
# sets of words, so confirm whether `==` or `issubset` was intended.
def is_group(self, syns):
return any(syns in group_syns for group_syns in self.groups.values())
# Drop every group whose members are all contained in `syns`.
#   syns: set of words; a group is removed when group_syns.issubset(syns).
# self.groups is rebound to a new dict holding only the surviving groups under
# their existing ids; the old dict is not mutated in place.
def clear_subgroups(self, syns):
self.groups = {group: group_syns for group, group_syns in self.groups.items() if not group_syns.issubset(syns)}
# Build a dict from a representative word to its synonym group.
#   counts: mapping queried through counts.get(word) to rank group members.
# For each group returned by self.get_groups(), the key is the member with the
# largest count and the value is the group itself. If two groups share the same
# top-ranked word, the later group overwrites the earlier entry.
# NOTE(review): assumes every group member has an entry in `counts`
# (counts.get would otherwise yield None as a sort key) - TODO confirm.
def get_dict(self, counts):
groups = self.get_groups()
return {max(group, key=counts.get): group for group in groups}
def get_groups(self):
return [syns for group, syns in self.groups.items()]
groups = []
for w1, w2 in self.syn_pairs:
if not Synset.join_groups(w1, w2, groups):
groups.append({w1, w2})
for word in self.vocab:
if not Synset.group_for(word, groups):
return groups
# {a, b} and {b, c} become {a, b, c}
def join_groups(w1, w2, groups):
g1 = Synset.group_for(w1, groups)
g2 = Synset.group_for(w2, groups)
if g1:
if g2:
g1 = g1 if g1 else {w1}
g2 = g2 if g2 else {w2}
return True
# {a, b} and {b, c} are separate groups unless {a, c}
# Examine every pair (g1, g2) where g1 is a group containing w1 and g2 is a
# group containing w2. Returns True at the first pair whose remaining members
# are equal once w1 is removed from g1 and w2 from g2; returns False when no
# such pair exists.
# NOTE(review): no statement is visible between the `if` and `return True` in
# this view; if the two groups are meant to be merged at that point, that step
# is not shown here - confirm against the full file.
def join_identical_groups(w1, w2, groups):
for g1 in [group for group in groups if w1 in group]:
for g2 in [group for group in groups if w2 in group]:
if g1 - {w1} == g2 - {w2}:
return True
return False
def get_syns(word):
syns = {word}
for syn in wordnet.synsets(word):
for lemma in syn.lemmas():
return syns
# Return the first group in `groups` that contains `w`, or None if none does.
#   w: the word to look up.
#   groups: iterable of containers supporting `in` (callers pass a list of sets).
# Takes no `self`: callers in this file invoke it through the class, as
# Synset.group_for(word, groups).
def group_for(w, groups):
for group in groups:
if w in group:
return group
return None
# Build a TargetExtractor from the string 'camera' and a metadata TSV path.
# NOTE(review): presumably the product category and its dataset file - confirm
# against TargetExtractor.__init__, which is not visible in this view.
extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment