Joel Oksanen / individual_project / Commits / ab2bf2ed

Commit ab2bf2ed, authored Apr 29, 2020 by Joel Oksanen

    Integrated synonyms to target_extractor

Parent: 4f0969e9
Changes: 4 files
ADA/server/agent/analyze_data.py

@@ -164,7 +164,7 @@ def get_strengths(qbaf):
 #############
-all_reviews = pd.read_csv('amazon_data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
+all_reviews = pd.read_csv('target_extraction/data/camera_prepared_data.tsv', sep='\t', error_bad_lines=False)
 camera_strengths = []
 star_rating_averages = []
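Note: the error_bad_lines=False flag was deprecated in pandas 1.3 and removed in 2.0; on a recent pandas the equivalent read (a sketch under that version assumption, not part of this commit) would be:

all_reviews = pd.read_csv('target_extraction/data/camera_prepared_data.tsv',
                          sep='\t', on_bad_lines='skip')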
ADA/server/agent/prep_metadata.py

@@ -3,7 +3,6 @@ import gzip
 import json
 import re
-output_location = 'target_extraction/data/camera_metadata.tsv'

 def parse(path):
     g = gzip.open(path, 'rb')
@@ -20,31 +19,35 @@ def get_df(path):
     return pd.DataFrame.from_dict(df, orient='index')

-metadata = get_df('amazon_data/meta_Electronics.json.gz')
+metadata = get_df('amazon_data/meta_Musical_Instruments.json.gz')
+output_location = 'target_extraction/data/guitar_metadata.tsv'

 for col in metadata.columns:
     print(col)

-# get metadata for camera products
-metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
-
-# try to filter out camera accessories
-filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
-                'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
-filter_pat = ''
-for word in filter_words:
-    word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
-    filter_pat += word_filter + '|'
-filter_pat = filter_pat[:-1]
-r = re.compile(filter_pat)
-metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
-metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
-
-for _, row in metadata.head(20).iterrows():
-    print('features:', row['feature'])
-    print('description:', row['description'])
-    print('tech1:', row['tech1'])
-    print('tech2:', row['tech2'])
+# get metadata for guitars
+metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
+
+# # get metadata for camera products
+# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
+#
+# # try to filter out camera accessories
+# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
+#                 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
+# filter_pat = ''
+# for word in filter_words:
+#     word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
+#     filter_pat += word_filter + '|'
+# filter_pat = filter_pat[:-1]
+# r = re.compile(filter_pat)
+# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
+# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
+#
+# for _, row in metadata.head(20).iterrows():
+#     print('features:', row['feature'])
+#     print('description:', row['description'])
+#     print('tech1:', row['tech1'])
+#     print('tech2:', row['tech2'])

 metadata.to_csv(output_location, sep='\t', index=False)
 print('Successfully prepared data for', len(metadata.index), 'products')
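For reference, the (now commented-out) accessory filter builds an alternation whose first letters match either case; a minimal sketch of its output, with a hypothetical two-word list:

filter_words = ['accessor', 'battery']
# the loop above would produce: '[Aa]ccessor|[Bb]attery'
# a similar effect with less string assembly would be re.IGNORECASE,
# though that ignores case in every position, not just the first:
import re
r = re.compile('|'.join(filter_words), re.IGNORECASE)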
ADA/server/agent/target_extraction/concept_net.py

 import requests
 import threading
 from anytree import Node
 import sys
+import time

 class ConceptNet:
@@ -22,6 +22,7 @@ class ConceptNet:
     def get_relatedness(self, f1, f2):
         uri = '/relatedness?node1=/c/en/{f1}&node2=/c/en/{f2}'.format(f1=f1.replace(' ', '_'), f2=f2.replace(' ', '_'))
         obj = requests.get(self.url + uri).json()
+        time.sleep(0.5)  # only 3600 requests allowed / hour
         return obj['value']

     def append_result(self, feature, rel, result_set, lock):
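A hypothetical usage sketch of the rate-limited client above (assuming ConceptNet() can be constructed without arguments and that self.url points at the public ConceptNet API; the /relatedness endpoint returns a value in [-1, 1]):

cnet = ConceptNet()
score = cnet.get_relatedness('digital camera', 'lens')  # one GET, then a 0.5 s sleep
print(score)  # a float; higher means more related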
ADA/server/agent/target_extraction/target_extractor.py

@@ -3,7 +3,7 @@ import ast
 from collections import Counter
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords, wordnet, wordnet_ic
+from nltk.corpus import stopwords, wordnet
 from nltk.stem import WordNetLemmatizer
 import string
 from gensim.models.phrases import Phrases, Phraser
@@ -11,7 +11,7 @@ from concept_net import ConceptNet
 from anytree import Node, RenderTree
 import itertools
 import numpy as np
-from sklearn.preprocessing import normalize
+import re

 stop_words = stopwords.words('english')
 wnl = WordNetLemmatizer()
@@ -19,39 +19,44 @@ cnet = ConceptNet()
 class TargetExtractor:

-    MIN_RELATEDNESS = 0.1
+    MIN_RELATEDNESS = 0.3
+    N_ASPECTS = 50
     MIN_DIRECT_GAIN = 0.1
     DEPTH_COST = 0.3

     def __init__(self, product, metadata_path):
         self.product = product
         self.metadata = pd.read_csv(metadata_path, sep='\t', error_bad_lines=False)
-        self.features = self.get_all('feature')
-        self.descriptions = self.get_all('description')
-        self.tech1 = self.get_all('tech1')
-        self.tech2 = self.get_all('tech2')
+        features = self.get_all('feature')
+        # descriptions = self.get_all('description')
+        # tech1 = self.get_all('tech1')
+        # tech2 = self.get_all('tech2')

         # tokenize and normalize phrases
         self.phrases = [[TargetExtractor.singular(w.lower()) for w in word_tokenize(phrase.replace('_', ' '))]
-                        for phrase in self.features]
+                        for phrase in features]

         # train bigram map
         tokenized_phrases = Phrases(self.phrases)
         self.bigrammer = Phraser(tokenized_phrases)

         # mine aspects
-        aspects, counts = self.get_related_nouns(30)
-        print(aspects)
+        aspects, counts = self.get_related_nouns(TargetExtractor.N_ASPECTS)
+
+        # obtain synonyms
+        synset = Synset(aspects)
+        self.syn_dict = synset.get_dict(counts)
+        print(self.syn_dict)
+
+        # remove aspect synonyms
+        aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
+        counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect])
+                  for aspect, count in counts.items() if aspect in aspects}

         # extract relationships between aspects
         relatedness_matrix = self.get_relations(aspects, counts)

         # extract aspect tree
         self.tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
         print(RenderTree(self.tree))

     def get_tree(self):
         return self.tree
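To illustrate the new synonym step (a hypothetical sketch; Synset is defined later in this file, and the aspect names and counts here are made up): if syn_dict maps the canonical aspect 'picture' to the group ['picture', 'photo', 'image'], the dictionary comprehension above folds the synonyms' counts into the canonical aspect:

counts = {'picture': 40, 'photo': 25, 'image': 10}
syn_dict = {'picture': ['picture', 'photo', 'image']}
merged = {aspect: sum(counts[syn] for syn in syns) for aspect, syns in syn_dict.items()}
# merged == {'picture': 75}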
@@ -66,7 +71,8 @@ class TargetExtractor:
             bigrams = self.bigrammer[phrase]
             for pair in pair_counts:
                 t1, t2 = pair
-                if t1 in bigrams and t2 in bigrams:
+                if (any(term in bigrams for term in self.syn_dict[t1])
+                        and any(term in bigrams for term in self.syn_dict[t2])):
                     pair_counts[pair] += 1

         relatedness_matrix = np.zeros((len(targets), len(targets)))
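The rewritten condition above counts a co-occurrence whenever any synonym of each aspect appears among the phrase's bigrams; a made-up illustration:

bigrams = ['photo', 'lens', 'quality']
syn_dict = {'picture': ['picture', 'photo'], 'lens': ['lens']}
hit = (any(term in bigrams for term in syn_dict['picture'])
       and any(term in bigrams for term in syn_dict['lens']))
# hit == True: 'photo' stands in for 'picture'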
@@ -89,7 +95,8 @@ class TargetExtractor:
         nouns = []
         for phrase in self.phrases:
             pos_tags = pos_tag(phrase)
-            bigrams = self.bigrammer[phrase]
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+                       for bigram in self.bigrammer[phrase]]
             word_idx = 0
             for token in bigrams:
                 if '_' in token:
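The new list comprehension strips the product name out of compound bigrams while leaving the bare product token alone; a quick illustration, assuming the product is 'camera':

import re
product = 'camera'
re.sub('_*' + product + '_*', '', 'camera_lens')  # -> 'lens'
re.sub('_*' + product + '_*', '', 'battery')      # -> 'battery' (no match)
# a bigram equal to 'camera' itself is kept unchanged by the conditional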
@@ -117,13 +124,19 @@ class TargetExtractor:
         return targets, {target: count for target, count in common}

+    @staticmethod
+    def wordnet_relatedness(t1, t2):
+        fst = wordnet.synset(t1 + '.n.01')
+        snd = wordnet.synset(t2 + '.n.01')
+        return fst.wup_similarity(snd)
+
     @staticmethod
     def spanning_tree_from_root(vertices, weights, root_idx=0):
         root = Node(vertices[root_idx])
         for idx in np.flip(np.argsort(weights[root_idx])):
             if idx == root_idx:
                 continue
-            gain = weights[root_idx][idx]
+            gain = max(TargetExtractor.MIN_DIRECT_GAIN, weights[root_idx][idx])
             parent = root
             for branch_node in root.descendants:
                 min_scaled_weight = min(weights[n.idx][idx] * pow(TargetExtractor.DEPTH_COST, branch_node.depth)
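The added wordnet_relatedness helper scores two nouns by Wu-Palmer similarity over their first WordNet noun senses; a quick sanity check (exact values depend on the installed WordNet data):

from nltk.corpus import wordnet
wordnet.synset('camera.n.01').wup_similarity(wordnet.synset('lens.n.01'))  # a float in (0, 1]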
@@ -228,6 +241,4 @@ class Synset:
                 return group
         return None

-extractor = TargetExtractor('camera', 'data/camera_metadata.tsv')
-extractor.get_tree()
+print(TargetExtractor('camera', 'data/camera_metadata.tsv').get_tree())