Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
individual_project
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Joel Oksanen
individual_project
Commits
8c3b320d
Commit
8c3b320d
authored
4 years ago
by
Joel Oksanen
Browse files
Options
Downloads
Patches
Plain Diff
Implemented word2vec for detecting synonyms
parent
7d9fd221
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
ADA/server/agent/prep_metadata.py
+19
-9
19 additions, 9 deletions
ADA/server/agent/prep_metadata.py
ADA/server/agent/target_extraction/target_extractor.py
+44
-32
44 additions, 32 deletions
ADA/server/agent/target_extraction/target_extractor.py
with
63 additions
and
41 deletions
ADA/server/agent/prep_metadata.py
+
19
−
9
View file @
8c3b320d
...
...
@@ -2,7 +2,7 @@ import pandas as pd
import
gzip
import
json
MAX_ITEMS
=
15
0000
MAX_ITEMS
=
20
0000
def
parse
(
path
):
...
...
@@ -17,19 +17,30 @@ def get_df(path):
for
d
in
parse
(
path
):
df
[
i
]
=
d
i
+=
1
if
i
==
1000000
:
if
i
==
MAX_ITEMS
:
break
return
pd
.
DataFrame
.
from_dict
(
df
,
orient
=
'
index
'
)
metadata
=
get_df
(
'
amazon_data/Electronics.json.gz
'
)
output_location
=
'
target_extraction/data/electronics_reviews.tsv
'
child_product
=
'
speaker
'
reviews
=
pd
.
read_csv
(
'
amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz
'
,
sep
=
'
\t
'
,
error_bad_lines
=
False
,
compression
=
'
gzip
'
)
parent_output
=
'
target_extraction/data/electronics_reviews.tsv
'
child_output
=
'
target_extraction/data/
'
+
child_product
+
'
_reviews.tsv
'
for
col
in
metadata
.
columns
:
for
col
in
reviews
.
columns
:
print
(
col
)
metadata
=
metadata
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
metadata
=
metadata
.
head
(
MAX_ITEMS
)
c_reviews
=
reviews
[
reviews
[
'
product_title
'
].
str
.
contains
(
child_product
,
case
=
False
,
na
=
False
)]
p_reviews
=
reviews
[
~
reviews
[
'
product_title
'
].
str
.
contains
(
child_product
,
case
=
False
,
na
=
False
)]
c_reviews
=
c_reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
c_reviews
=
c_reviews
.
head
(
MAX_ITEMS
)
p_reviews
=
p_reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
p_reviews
=
p_reviews
.
head
(
MAX_ITEMS
)
p_reviews
.
to_csv
(
parent_output
,
sep
=
'
\t
'
,
index
=
False
)
c_reviews
.
to_csv
(
child_output
,
sep
=
'
\t
'
,
index
=
False
)
print
(
'
Successfully prepared data for
'
,
len
(
p_reviews
.
index
),
'
parent and
'
,
len
(
c_reviews
.
index
),
'
child reviews
'
)
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
...
...
@@ -55,5 +66,4 @@ metadata = metadata.head(MAX_ITEMS)
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
metadata
.
to_csv
(
output_location
,
sep
=
'
\t
'
,
index
=
False
)
print
(
'
Successfully prepared data for
'
,
len
(
metadata
.
index
),
'
reviews
'
)
This diff is collapsed.
Click to expand it.
ADA/server/agent/target_extraction/target_extractor.py
+
44
−
32
View file @
8c3b320d
...
...
@@ -12,6 +12,7 @@ from anytree import Node, RenderTree
import
itertools
import
numpy
as
np
import
re
from
gensim.models
import
Word2Vec
stop_words
=
stopwords
.
words
(
'
english
'
)
wnl
=
WordNetLemmatizer
()
...
...
@@ -29,7 +30,11 @@ class TargetExtractor:
N_ASPECTS
=
30
MIN_DIRECT_GAIN
=
0.1
DEPTH_COST
=
0.3
FREQ_OVER_PARENT
=
2
# target must appear x times more frequently than in parent
FREQ_OVER_PARENT
=
3
# target must appear x times more frequently than in parent
# word2vec
MIN_SIMILARITY
=
0
SYNONYM_SIMILARITY
=
0.7
# parent is a TargetExtractor of a parent category, e.g. electronics > camera
def
__init__
(
self
,
product
,
texts
,
parent
=
None
):
...
...
@@ -37,7 +42,7 @@ class TargetExtractor:
self
.
parent
=
parent
# tokenize and normalize phrases
self
.
phrases
=
[[
TargetExtractor
.
singular
(
w
.
lower
()
)
for
w
in
word_tokenize
(
phrase
.
replace
(
'
_
'
,
'
'
))]
self
.
phrases
=
[[
w
.
lower
()
for
w
in
word_tokenize
(
phrase
.
replace
(
'
_
'
,
'
'
))]
for
phrase
in
texts
]
# train bigram map
...
...
@@ -47,16 +52,20 @@ class TargetExtractor:
# count terms
self
.
counter
=
self
.
count_nouns
()
self
.
total_count
=
sum
(
self
.
counter
.
values
())
print
(
parent
,
self
.
total_count
)
def
get_tree_and_synonyms
(
self
):
# train word2vec model
wv
=
self
.
get_word2vec_model
()
# mine aspects
aspects
,
counts
=
self
.
get_related_nouns
(
self
.
counter
)
aspects
,
counts
=
self
.
get_related_nouns
(
self
.
counter
,
wv
)
print
(
aspects
)
# obtain synonyms
synset
=
Synset
(
aspects
)
syn_pairs
=
self
.
get_syn_pairs
(
aspects
,
wv
)
print
(
syn_pairs
)
synset
=
Synset
(
aspects
,
syn_pairs
)
syn_dict
=
synset
.
get_dict
(
counts
)
# remove aspect synonyms
...
...
@@ -110,27 +119,25 @@ class TargetExtractor:
if
'
_
'
in
token
:
words
=
token
.
split
(
'
_
'
)
if
any
(
TargetExtractor
.
is_noun
(
pos_tags
[
i
])
for
i
in
range
(
word_idx
,
word_idx
+
len
(
words
))):
nouns
.
append
(
token
)
nouns
.
append
(
TargetExtractor
.
singular
(
token
)
)
word_idx
+=
len
(
words
)
else
:
if
len
(
token
)
>
1
and
TargetExtractor
.
is_noun
(
pos_tags
[
word_idx
]):
nouns
.
append
(
token
)
nouns
.
append
(
TargetExtractor
.
singular
(
token
)
)
word_idx
+=
1
return
Counter
(
nouns
)
def
get_related_nouns
(
self
,
counter
):
def
get_related_nouns
(
self
,
counter
,
wv
):
common
=
counter
.
most_common
()
term_counts
=
[]
while
len
(
term_counts
)
<
TargetExtractor
.
N_ASPECTS
:
term
,
count
=
common
.
pop
(
0
)
print
(
term
)
print
(
term
,
count
)
# filter terms not related to the product
# cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
if
(
not
self
.
parent
or
self
.
parent
.
frequency_for_term
(
term
)
==
0
or
self
.
frequency_for_term
(
term
)
/
self
.
parent
.
frequency_for_term
(
term
)
>
TargetExtractor
.
FREQ_OVER_PARENT
):
if
self
.
is_related_to_product
(
term
,
wv
):
term_counts
.
append
((
term
,
count
))
terms
=
[
term
for
term
,
count
in
term_counts
]
...
...
@@ -143,9 +150,23 @@ class TargetExtractor:
return
terms
,
{
term
:
count
for
term
,
count
in
term_counts
}
def is_related_to_product(self, term, wv):
    """Decide whether `term` is an aspect related to this extractor's product.

    A term is related when its word2vec similarity to the product word exceeds
    TargetExtractor.MIN_SIMILARITY, and (if a parent extractor exists) the term
    is distinctive for this category: it is absent from the parent's counts or
    appears more than FREQ_OVER_PARENT times more frequently here than there.

    :param term: candidate aspect term (string)
    :param wv: trained word vectors (gensim KeyedVectors-like; must expose
               `vocab` and `similarity`)
    :return: True if the term passes both the similarity and the
             parent-frequency filters, False otherwise
    """
    # Both words must be present in the embedding vocabulary, otherwise
    # wv.similarity raises a KeyError. The original code only checked `term`,
    # so a product word missing from the vocabulary crashed the extraction.
    if term not in wv.vocab or self.product not in wv.vocab:
        return False
    if wv.similarity(self.product, term) <= TargetExtractor.MIN_SIMILARITY:
        return False
    # No parent category: the similarity filter alone decides.
    if not self.parent:
        return True
    # Distinctiveness relative to the parent category: a term never seen in
    # the parent is trivially distinctive; otherwise require the relative
    # frequency ratio to exceed FREQ_OVER_PARENT.
    parent_freq = self.parent.frequency_for_term(term)
    if parent_freq == 0:
        return True
    return self.frequency_for_term(term) / parent_freq > TargetExtractor.FREQ_OVER_PARENT
@staticmethod
def get_syn_pairs(terms, model):
    """Return the set of candidate synonym pairs among `terms`.

    Each pair is a frozenset of two distinct terms whose word2vec similarity
    (per `model`) exceeds TargetExtractor.SYNONYM_SIMILARITY.
    """
    pairs = set()
    for first in terms:
        for second in terms:
            if first == second:
                continue
            if model.similarity(first, second) > TargetExtractor.SYNONYM_SIMILARITY:
                # frozenset makes the pair order-insensitive, so (a, b) and
                # (b, a) collapse into a single entry.
                pairs.add(frozenset((first, second)))
    return pairs
def frequency_for_term(self, term):
    """Return the relative frequency of `term` among all counted nouns.

    Looks the term up in the noun counter and normalises by the total
    number of counted occurrences.
    """
    occurrences = self.counter[term]
    return occurrences / self.total_count
def get_word2vec_model(self):
    """Train a Word2Vec model on the tokenized phrases and return its vectors.

    Words occurring fewer than 5 times are dropped by the model
    (min_count=5); only the trained word vectors (`.wv`) are returned,
    not the full model.
    """
    model = Word2Vec(self.phrases, min_count=5)
    return model.wv
@staticmethod
def
wordnet_relatedness
(
t1
,
t2
):
fst
=
wordnet
.
synset
(
t1
+
'
.n.01
'
)
...
...
@@ -199,18 +220,9 @@ class TargetExtractor:
class
Synset
:
def
__init__
(
self
,
aspects
):
def
__init__
(
self
,
aspects
,
syn_pairs
):
self
.
vocab
=
aspects
self
.
syn_pairs
=
{
frozenset
((
aspect
,
syn
))
for
aspect
in
aspects
for
syn
in
self
.
get_syns
(
aspect
)
if
aspect
!=
syn
}
def
get_syns
(
self
,
word
):
syns
=
set
()
for
syn
in
wordnet
.
synsets
(
word
,
pos
=
wordnet
.
NOUN
):
for
lemma
in
syn
.
lemmas
():
syns
.
add
(
lemma
.
name
())
syns
=
{
syn
for
syn
in
syns
if
syn
in
self
.
vocab
and
cnet
.
get_relatedness
(
syn
,
word
)
>
0.5
}
return
syns
self
.
syn_pairs
=
syn_pairs
def
get_dict
(
self
,
counts
):
groups
=
self
.
get_groups
()
...
...
@@ -231,6 +243,8 @@ class Synset:
def
join_groups
(
w1
,
w2
,
groups
):
g1
=
Synset
.
group_for
(
w1
,
groups
)
g2
=
Synset
.
group_for
(
w2
,
groups
)
if
g1
and
g2
and
g1
==
g2
:
return
True
if
g1
:
groups
.
remove
(
g1
)
if
g2
:
...
...
@@ -260,13 +274,11 @@ class Synset:
return
None
electronics_texts
=
obtain_texts
(
'
data/electronics_reviews.tsv
'
,
'
reviewText
'
)
print
(
1
)
electronics_extractor
=
TargetExtractor
(
'
device
'
,
electronics_texts
)
print
(
2
)
camera_texts
=
obtain_texts
(
'
data/camera_metadata.tsv
'
,
'
feature
'
)
print
(
3
)
camera_extractor
=
TargetExtractor
(
'
camera
'
,
camera_texts
,
parent
=
electronics_extractor
)
tree
,
synonyms
=
camera_extractor
.
get_tree_and_synonyms
()
laptop_texts
=
obtain_texts
(
'
data/laptop_reviews.tsv
'
,
'
review_body
'
)
laptop_extractor
=
TargetExtractor
(
'
laptop
'
,
laptop_texts
)
camera_texts
=
obtain_texts
(
'
data/camera_prepared_data.tsv
'
,
'
review_body
'
)
camera_extractor
=
TargetExtractor
(
'
camera
'
,
camera_texts
,
parent
=
laptop_extractor
)
tree
,
syns
=
camera_extractor
.
get_tree_and_synonyms
()
print
(
RenderTree
(
tree
))
print
(
synonyms
)
print
(
syns
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment