Commit a2ecd08d, authored May 26, 2020 by Joel Oksanen (individual_project)

Fixed some synonym bugs in target extractor

Parent: 34e90902
Changes: 3 files
ADA/server/agent/prep_metadata.py

@@ -24,20 +24,20 @@ def get_df(path):
     pd.set_option('display.max_colwidth', None)
-    category = 'Acoustic Guitars'
+    category = 'Cardigans'
-    metadata_iter = pd.read_json('amazon_data/meta_Musical_Instruments.json', lines=True, chunksize=1000)
+    metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
     metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)] for metadata in metadata_iter])
     print(len(metadata.index))
-    review_iter = pd.read_json('amazon_data/Musical_Instruments.json', lines=True, chunksize=1000)
+    review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
     reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
     print(len(reviews.index))
-    reviews.to_csv('target_extraction/data/verified_acoustic_guitar_reviews.tsv', sep='\t', index=False)
+    reviews.to_csv('target_extraction/data/verified_cardigan_reviews.tsv', sep='\t', index=False)
     # child_product = 'speaker'
     # reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
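The change above retargets the script from the Musical_Instruments dump to Clothing_Shoes_and_Jewelry and switches the category to 'Cardigans'. For reference, a minimal sketch of the same chunked read_json filtering pattern, assuming a JSON-lines file with a list-valued 'category' column; the file name below is a placeholder, not one of the repository's data files.

    import pandas as pd

    category = 'Cardigans'

    # stream the metadata in 1000-row chunks and keep only rows whose
    # 'category' list contains the target category
    meta_iter = pd.read_json('meta_sample.json', lines=True, chunksize=1000)  # placeholder path
    metadata = pd.concat([
        chunk[chunk['category'].apply(lambda cl: isinstance(cl, list) and category in cl)]
        for chunk in meta_iter
    ])
    print(len(metadata.index))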
ADA/server/agent/target_extraction/BERT/relation_extractor/bert_rel_extractor.py

@@ -234,8 +234,6 @@ class BertRelExtractor:
         count_matrix[snd_idx][fst_idx] += 1
         count_matrix[fst_idx][snd_idx] += 1
-        prob_matrix = (prob_matrix.T / aspect_counts).T  # scale rows by aspect counts
-        return prob_matrix
+        return prob_matrix, count_matrix
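The extractor now hands both the probability and count matrices back to the caller instead of scaling in place; the row scaling itself becomes the caller's responsibility. A quick standalone numpy check of that operation, with toy values rather than project data:

    import numpy as np

    prob_matrix = np.array([[2.0, 4.0],
                            [3.0, 9.0]])
    aspect_counts = np.array([2, 3])

    # divide row i of prob_matrix by aspect_counts[i]
    scaled = (prob_matrix.T / aspect_counts).T
    # scaled == [[1.0, 2.0],
    #            [1.0, 3.0]]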
ADA/server/agent/target_extraction/target_extractor.py

@@ -39,7 +39,8 @@ class TargetExtractor:
     # word2vec
     MIN_TERM_COUNT = 100
-    SYNONYM_SIMILARITY = 0.10
+    SYNONYM_SIMILARITY = 0.12
+    SYNONYM_SIMILARITY_PRODUCT = 0.09
     WV_SIZE = 100
     WV_WINDOW = 7
@@ -82,18 +83,20 @@ class TargetExtractor:
print
(
'mining aspects...'
)
# mine aspects
aspects
,
counts
=
self
.
get_aspects
(
self
.
counter
)
self
.
aspects
,
self
.
counts
=
self
.
get_aspects
(
self
.
counter
)
print
(
'extracting synonyms...'
)
# obtain synonyms
syn_pairs
=
self
.
get_syn_pairs
(
aspects
,
self
.
wv
)
synset
=
Synset
(
aspects
,
syn_pairs
,
self
.
product
)
self
.
syn_dict
=
synset
.
get_dict
(
counts
)
syn_pairs
=
self
.
get_syn_pairs
()
synset
=
Synset
(
self
.
aspects
,
syn_pairs
,
self
.
product
)
self
.
syn_dict
=
synset
.
get_dict
(
self
.
counts
)
# remove aspect synonyms and reorder list based on sum of all synonym counts
aspects
=
[
aspect
for
aspect
in
aspects
if
aspect
in
self
.
syn_dict
.
keys
()]
self
.
counts
=
{
aspect
:
sum
(
counts
[
syn
]
for
syn
in
self
.
syn_dict
[
aspect
])
for
aspect
in
aspects
}
self
.
aspects
=
sorted
(
aspects
,
key
=
self
.
counts
.
get
,
reverse
=
True
)
self
.
aspects
=
[
aspect
for
aspect
in
self
.
aspects
if
aspect
in
self
.
syn_dict
.
keys
()]
self
.
counts
=
{
aspect
:
sum
(
self
.
counts
[
syn
]
for
syn
in
self
.
syn_dict
[
aspect
])
for
aspect
in
self
.
aspects
}
self
.
aspects
=
sorted
(
self
.
aspects
,
key
=
self
.
counts
.
get
,
reverse
=
True
)
print
(
self
.
syn_dict
)
self
.
save
()
...
...
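After the synonym dictionary is built, the aspects are restricted to its keys, their counts are pooled over synonyms, and the list is re-sorted by the pooled counts. A toy illustration of that pooling and reordering, with made-up terms and counts:

    counts = {'picture': 120, 'image': 80, 'sound': 150}
    syn_dict = {'picture': {'picture', 'image'}, 'sound': {'sound'}}
    aspects = ['sound', 'picture', 'image']

    aspects = [a for a in aspects if a in syn_dict.keys()]
    counts = {a: sum(counts[s] for s in syn_dict[a]) for a in aspects}
    aspects = sorted(aspects, key=counts.get, reverse=True)
    # aspects == ['picture', 'sound']   (120 + 80 = 200 > 150)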
@@ -115,15 +118,6 @@ class TargetExtractor:
print
(
self
.
relatedness_matrix
)
print
(
RenderTree
(
self
.
tree
))
def
extract_relatedness_matrix
(
self
):
print
(
'extracting relatedness matrix...'
)
# extract relationships between aspects
self
.
relatedness_matrix
=
self
.
get_bert_relations
()
print
(
self
.
aspects
)
print
(
self
.
syn_dict
)
print
(
self
.
relatedness_matrix
)
def
save_product_representation
(
self
):
f
=
open
(
'extracted_products/'
+
self
.
product
+
Product
.
FILE_EXTENSION
,
'wb'
)
p
=
Product
(
self
.
tree
,
self
.
syn_dict
)
...
...
@@ -174,10 +168,29 @@ class TargetExtractor:
dataset
=
PairRelDataset
.
from_df
(
df
,
size
=
TargetExtractor
.
MAX_BERT_DATASET_SIZE
)
bert_extractor
=
BertRelExtractor
.
load_saved
(
rel_extractor_path
)
aspect_counts
=
np
.
array
([
self
.
counts
[
aspect
]
for
aspect
in
self
.
aspects
])
relatedness_matrix
=
bert_extractor
.
extract_relations
(
len
(
self
.
aspects
),
self
.
aspect_index_map
(),
aspect_counts
,
dataset
=
dataset
)
prob_matrix
,
count_matrix
=
bert_extractor
.
extract_relations
(
len
(
self
.
aspects
),
self
.
aspect_index_map
(),
aspect_counts
,
dataset
=
dataset
)
self
.
relatedness_matrix
=
(
prob_matrix
.
T
/
aspect_counts
).
T
# scale rows by aspect counts
return
self
.
relatedness_matrix
def
extract_synset
(
self
):
for
idx
,
aspect
in
enumerate
(
self
.
aspects
):
if
idx
==
0
:
continue
synset
=
{
idx
}
aspect_dependence
=
self
.
aspect_dependence
(
idx
)
for
syn_idx
in
self
.
get_syns
(
aspect
):
if
syn_idx
<
idx
and
syn_idx
!=
aspect_dependence
:
synset
.
add
(
syn_idx
)
self
.
print_relations_from
(
aspect
)
if
len
(
synset
)
>
1
:
return
synset
return
None
return
relatedness_matrix
def
get_syns
(
self
,
aspect
):
return
{
idx
for
idx
,
a
in
enumerate
(
self
.
aspects
)
if
a
!=
aspect
and
self
.
wv
.
relative_cosine_similarity
(
a
,
aspect
)
>
TargetExtractor
.
SYNONYM_SIMILARITY
}
def
aspect_index_map
(
self
):
return
{
syn
:
idx
for
idx
,
aspect
in
enumerate
(
self
.
aspects
)
for
syn
in
self
.
syn_dict
[
aspect
]}
...
...
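The new extract_synset walks the aspects in descending-count order and merges an earlier aspect into the current one's synset only when it is not the aspect the current one depends on. A plain-Python walk-through of that rule, with get_syns and aspect_dependence stubbed out (the real versions consult the word vectors and the relatedness matrix):

    aspects = ['camera', 'picture', 'image', 'photo']

    def get_syns(idx):                   # stub: indices with similar word vectors
        return {1: {2, 3}, 2: {1, 3}, 3: {1, 2}}.get(idx, set())

    def aspect_dependence(idx):          # stub: 'image' treated as dependent on 'picture'
        return 1 if idx == 2 else None

    def extract_synset():
        for idx in range(1, len(aspects)):
            synset = {idx}
            dep = aspect_dependence(idx)
            for syn_idx in get_syns(idx):
                if syn_idx < idx and syn_idx != dep:
                    synset.add(syn_idx)
            if len(synset) > 1:
                return synset
        return None

    print(extract_synset())  # {1, 2, 3}: 'photo' pulls in 'picture' and 'image'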
@@ -291,10 +304,13 @@ class TargetExtractor:
return
None
@
staticmethod
def
get_syn_pairs
(
terms
,
model
):
return
{
frozenset
((
t1
,
t2
))
for
t1
in
terms
for
t2
in
terms
if
t1
!=
t2
and
model
.
relative_cosine_similarity
(
t1
,
t2
)
>
TargetExtractor
.
SYNONYM_SIMILARITY
}
def
get_syn_pairs
(
self
):
return
{
frozenset
((
t1
,
t2
))
for
t1
in
self
.
aspects
for
t2
in
self
.
aspects
if
t1
!=
t2
and
(
wnl
.
lemmatize
(
t1
)
==
wnl
.
lemmatize
(
t2
)
or
self
.
wv
.
relative_cosine_similarity
(
t1
,
t2
)
>
(
TargetExtractor
.
SYNONYM_SIMILARITY_PRODUCT
if
(
t1
==
self
.
product
or
t2
==
self
.
product
)
else
TargetExtractor
.
SYNONYM_SIMILARITY
))}
def
get_word2vec_model
(
self
,
size
,
window
,
min_count
):
model
=
Word2Vec
(
self
.
ngrams
(
self
.
phrases
),
size
=
size
,
window
=
window
,
min_count
=
min_count
).
wv
...
...
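get_syn_pairs now pairs two aspects when their WordNet lemmas coincide or their relative cosine similarity clears a threshold, using the looser SYNONYM_SIMILARITY_PRODUCT whenever one of the terms is the product itself. A hedged sketch of that decision logic with the word vectors stubbed out; it assumes NLTK with the WordNet corpus available (nltk.download('wordnet')), and every similarity value below is invented:

    from nltk.stem import WordNetLemmatizer

    wnl = WordNetLemmatizer()
    SYNONYM_SIMILARITY = 0.12
    SYNONYM_SIMILARITY_PRODUCT = 0.09

    class StubWV:                        # stand-in for the trained gensim vectors
        def relative_cosine_similarity(self, t1, t2):
            return {frozenset(('cardigan', 'sweater')): 0.10}.get(frozenset((t1, t2)), 0.0)

    def syn_pairs(aspects, product, wv):
        return {frozenset((t1, t2)) for t1 in aspects for t2 in aspects
                if t1 != t2 and (wnl.lemmatize(t1) == wnl.lemmatize(t2)
                                 or wv.relative_cosine_similarity(t1, t2) >
                                 (SYNONYM_SIMILARITY_PRODUCT if product in (t1, t2)
                                  else SYNONYM_SIMILARITY))}

    print(syn_pairs(['cardigan', 'sweater', 'sleeve', 'sleeves'], 'cardigan', StubWV()))
    # 'cardigan'/'sweater' pair via the product threshold (0.10 > 0.09);
    # 'sleeve'/'sleeves' pair via identical lemmas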
@@ -312,15 +328,25 @@ class TargetExtractor:
f
.
close
()
return
extractor
def
closest_relative_for_idx
(
self
,
idx
):
return
np
.
argmax
(
self
.
relatedness_matrix
[
idx
])
def
aspect_dependence
(
self
,
idx
):
row
=
self
.
relatedness_matrix
[
idx
]
max_idx1
,
max_idx2
=
row
[
1
:].
argsort
()[
-
2
:][::
-
1
]
+
1
if
max_idx1
<
idx
and
row
[
max_idx1
]
>=
row
[
max_idx2
]
*
TargetExtractor
.
SUBFEATURE_MULT
:
return
max_idx1
else
:
return
None
def
get_product_tree
(
self
):
root
=
Node
(
self
.
aspects
[
0
])
root
.
idx
=
0
for
idx
in
range
(
1
,
len
(
self
.
aspects
)):
# for each feature in order from highest to lowest count
row
=
self
.
relatedness_matrix
[
idx
]
max_idx1
,
max_idx2
=
row
[
1
:].
argsort
()[
-
2
:][::
-
1
]
+
1
if
max_idx1
<
idx
and
row
[
max_idx1
]
>=
row
[
max_idx2
]
*
TargetExtractor
.
SUBFEATURE_MULT
:
parent
=
next
(
n
for
n
in
root
.
descendants
if
n
.
idx
==
max_idx1
)
dep_idx
=
self
.
aspect_dependence
(
idx
)
if
dep_idx
is
not
None
:
parent
=
next
(
n
for
n
in
root
.
descendants
if
n
.
idx
==
dep_idx
)
else
:
parent
=
root
node
=
Node
(
self
.
aspects
[
idx
],
parent
=
parent
)
...
...
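The new aspect_dependence helper centralises the index arithmetic that get_product_tree previously inlined: take the two largest entries of row[1:] (the root at index 0 is skipped), shift them by +1 back to full-row positions, and accept the strongest one only if it points at an earlier aspect and dominates the runner-up by SUBFEATURE_MULT. A small numpy check of that arithmetic with made-up numbers (the SUBFEATURE_MULT value here is illustrative, not the class constant):

    import numpy as np

    SUBFEATURE_MULT = 2.0                         # illustrative value
    row = np.array([0.0, 0.9, 0.1, 0.3, 0.2])     # relatedness of aspect idx=4 to aspects 0..4
    idx = 4

    max_idx1, max_idx2 = row[1:].argsort()[-2:][::-1] + 1
    # row[1:].argsort() == [1, 3, 2, 0] -> [-2:][::-1] == [0, 2] -> +1 -> (1, 3)

    dependence = max_idx1 if (max_idx1 < idx
                              and row[max_idx1] >= row[max_idx2] * SUBFEATURE_MULT) else None
    # 0.9 >= 0.3 * 2.0 and 1 < 4, so aspect 4 is treated as dependent on aspect 1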
@@ -343,7 +369,8 @@ class TargetExtractor:
def
print_relations_from
(
self
,
aspect
):
idx
=
self
.
aspects
.
index
(
aspect
)
rels
=
self
.
relatedness_matrix
[
idx
].
copy
()
for
rel_idx
in
sorted
(
range
(
len
(
self
.
aspects
)),
key
=
lambda
i
:
rels
[
i
],
reverse
=
True
):
print
(
' relations from {}:'
.
format
(
aspect
))
for
rel_idx
in
sorted
(
range
(
len
(
self
.
aspects
)),
key
=
lambda
i
:
rels
[
i
],
reverse
=
True
)[:
5
]:
print
(
' {:.4f}'
.
format
(
rels
[
rel_idx
]),
self
.
aspects
[
rel_idx
])
...
...
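The [:5] slice added above just caps the printout at the five most related aspects. A quick standalone check of that sorted-indices pattern with toy scores:

    rels = [0.1, 0.7, 0.3, 0.9, 0.2, 0.5, 0.4]
    top5 = sorted(range(len(rels)), key=lambda i: rels[i], reverse=True)[:5]
    # top5 == [3, 1, 5, 6, 2]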
@@ -368,7 +395,6 @@ class Synset:
groups
.
append
({
word
})
return
groups
# {a, b} and {b, c} become {a, b, c}
@
staticmethod
def
join_groups
(
w1
,
w2
,
groups
):
g1
=
Synset
.
group_for
(
w1
,
groups
)
...
...
@@ -384,18 +410,6 @@ class Synset:
groups
.
append
(
g1
.
union
(
g2
))
return
True
# {a, b} and {b, c} are separate groups unless {a, c}
@
staticmethod
def
join_identical_groups
(
w1
,
w2
,
groups
):
for
g1
in
[
group
for
group
in
groups
if
w1
in
group
]:
for
g2
in
[
group
for
group
in
groups
if
w2
in
group
]:
if
g1
-
{
w1
}
==
g2
-
{
w2
}:
groups
.
remove
(
g1
)
groups
.
remove
(
g2
)
groups
.
append
(
g1
.
union
(
g2
))
return
True
return
False
@
staticmethod
def
group_for
(
w
,
groups
):
for
group
in
groups
:
...
...
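For context on the grouping the removed join_identical_groups used to special-case: the comment above the surviving join_groups says that {a, b} and {b, c} become {a, b, c} when a pair bridges them. A toy sketch of that union behaviour only; it mirrors the idea, not Synset's exact bookkeeping, and the group_for/join_groups names below are local stand-ins for the class methods:

    def group_for(w, groups):
        for group in groups:
            if w in group:
                return group
        return None

    def join_groups(w1, w2, groups):
        g1, g2 = group_for(w1, groups), group_for(w2, groups)
        if g1 is not None and g2 is not None and g1 is not g2:
            groups.remove(g1)
            groups.remove(g2)
            groups.append(g1.union(g2))
            return True
        return False

    groups = [{'a', 'b'}, {'b', 'c'}]
    join_groups('a', 'c', groups)
    print(groups)  # [{'a', 'b', 'c'}]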