Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
4f0969e9
Commit
4f0969e9
authored
Apr 29, 2020
by
Joel Oksanen
Browse files
Implemented synonym dict for TargetEExtractor
parent
4ff3515e
Changes
1
Hide whitespace changes
Inline
Side-by-side
ADA/server/agent/target_extraction/target_extractor.py
View file @
4f0969e9
...
...
@@ -3,7 +3,7 @@ import ast
from
collections
import
Counter
from
nltk
import
pos_tag
from
nltk.tokenize
import
word_tokenize
from
nltk.corpus
import
stopwords
,
wordnet
from
nltk.corpus
import
stopwords
,
wordnet
,
wordnet_ic
from
nltk.stem
import
WordNetLemmatizer
import
string
from
gensim.models.phrases
import
Phrases
,
Phraser
...
...
@@ -38,16 +38,26 @@ class TargetExtractor:
tokenized_phrases
=
Phrases
(
self
.
phrases
)
self
.
bigrammer
=
Phraser
(
tokenized_phrases
)
def
get_tree
(
self
):
# mine targets
targets
,
counts
=
self
.
get_related_nouns
(
50
)
# mine aspects
aspects
,
counts
=
self
.
get_related_nouns
(
30
)
print
(
aspects
)
# obtain synonyms
synset
=
Synset
(
aspects
)
self
.
syn_dict
=
synset
.
get_dict
(
counts
)
print
(
self
.
syn_dict
)
# extract relationships between aspects
relatedness_matrix
=
self
.
get_relations
(
aspects
,
counts
)
# extract relationships between targets
relatedness_matrix
=
self
.
get_relations
(
targets
,
counts
)
self
.
tree
=
TargetExtractor
.
spanning_tree_from_root
(
aspects
,
relatedness_matrix
)
print
(
RenderTree
(
self
.
tree
)
)
tree
=
TargetExtractor
.
spanning_tree_from_root
(
targets
,
relatedness_matrix
)
print
(
RenderTree
(
tree
))
return
tree
def
get_tree
(
self
):
return
self
.
tree
def
get_synonyms
(
self
):
return
self
.
syn_dict
def
get_relations
(
self
,
targets
,
counts
):
pair_counts
=
{
pair
:
0
for
pair
in
itertools
.
combinations
(
targets
,
2
)}
...
...
@@ -156,42 +166,68 @@ class TargetExtractor:
for
item
in
ast
.
literal_eval
(
items
)]
class
Targets
:
def
__init__
(
self
,
targets
):
self
.
targets
=
targets
self
.
groups
=
{
i
:
{
target
}
for
i
,
target
in
enumerate
(
targets
)}
self
.
next_idx
=
len
(
targets
)
class
Synset
:
def
get
(
self
):
return
self
.
targets
def
__init__
(
self
,
aspects
):
self
.
vocab
=
aspects
self
.
syn_pairs
=
{
frozenset
((
aspect
,
syn
))
for
aspect
in
aspects
for
syn
in
self
.
get_syns
(
aspect
)
if
aspect
!=
syn
}
def
s
et_syns
(
self
,
syns
):
syn
_set
=
{
syn
for
syn
in
syns
if
syn
in
self
.
targets
}
if
not
self
.
is_group
(
syn_set
):
i
=
self
.
next_idx
self
.
next_idx
+=
1
self
.
clear_subgroups
(
syn_set
)
self
.
groups
[
i
]
=
syn_set
def
g
et_syns
(
self
,
word
):
syn
s
=
set
()
for
syn
in
wordnet
.
synsets
(
word
,
pos
=
wordnet
.
NOUN
):
for
lemma
in
syn
.
lemmas
():
syns
.
add
(
lemma
.
name
())
syns
=
{
syn
for
syn
in
syns
if
syn
in
self
.
vocab
and
cnet
.
get_relatedness
(
syn
,
word
)
>
0.5
}
return
syns
def
is_group
(
self
,
syns
):
return
any
(
syns
in
group_syns
for
group_syns
in
self
.
groups
.
values
())
def
clear_subgroups
(
self
,
syns
):
self
.
groups
=
{
group
:
group_syns
for
group
,
group_syns
in
self
.
groups
.
items
()
if
not
group_syns
.
issubset
(
syns
)}
def
get_dict
(
self
,
counts
):
groups
=
self
.
get_groups
()
return
{
max
(
group
,
key
=
counts
.
get
):
group
for
group
in
groups
}
def
get_groups
(
self
):
return
[
syns
for
group
,
syns
in
self
.
groups
.
items
()]
groups
=
[]
for
w1
,
w2
in
self
.
syn_pairs
:
if
not
Synset
.
join_groups
(
w1
,
w2
,
groups
):
groups
.
append
({
w1
,
w2
})
for
word
in
self
.
vocab
:
if
not
Synset
.
group_for
(
word
,
groups
):
groups
.
append
({
word
})
return
groups
# {a, b} and {b, c} become {a, b, c}
@
staticmethod
def
join_groups
(
w1
,
w2
,
groups
):
g1
=
Synset
.
group_for
(
w1
,
groups
)
g2
=
Synset
.
group_for
(
w2
,
groups
)
if
g1
:
groups
.
remove
(
g1
)
if
g2
:
groups
.
remove
(
g2
)
g1
=
g1
if
g1
else
{
w1
}
g2
=
g2
if
g2
else
{
w2
}
groups
.
append
(
g1
.
union
(
g2
))
return
True
# {a, b} and {b, c} are separate groups unless {a, c}
@
staticmethod
def
join_identical_groups
(
w1
,
w2
,
groups
):
for
g1
in
[
group
for
group
in
groups
if
w1
in
group
]:
for
g2
in
[
group
for
group
in
groups
if
w2
in
group
]:
if
g1
-
{
w1
}
==
g2
-
{
w2
}:
groups
.
remove
(
g1
)
groups
.
remove
(
g2
)
groups
.
append
(
g1
.
union
(
g2
))
return
True
return
False
def
get_syns
(
word
):
syns
=
{
word
}
for
syn
in
wordnet
.
synsets
(
word
)
:
for
lemma
in
syn
.
lemmas
()
:
syns
.
add
(
lemma
.
name
())
return
syns
@
staticmethod
def
group_for
(
w
,
groups
):
for
group
in
groups
:
if
w
in
group
:
return
group
return
None
extractor
=
TargetExtractor
(
'camera'
,
'data/camera_metadata.tsv'
)
extractor
.
get_tree
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment