Commit ef388c59 authored 4 years ago by Joel Oksanen
A lot of changes to the target extractor. Performs decently on cameras and laptops now.
parent 8c3b320d
Showing 3 changed files
ADA/.gitignore: 1 addition, 0 deletions
ADA/server/agent/prep_metadata.py: 34 additions, 15 deletions
ADA/server/agent/target_extraction/target_extractor.py: 291 additions, 55 deletions
with 326 additions and 70 deletions
ADA/.gitignore +1 −0

@@ -4,3 +4,4 @@ __pycache__/
server/agent/amazon_data/
server/agent/target_extraction/data/
.DS_Store
+*.pickle
\ No newline at end of file
ADA/server/agent/prep_metadata.py +34 −15

@@ -22,25 +22,44 @@ def get_df(path):
    return pd.DataFrame.from_dict(df, orient='index')

-child_product = 'speaker'
-reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
-                      compression='gzip')
-parent_output = 'target_extraction/data/electronics_reviews.tsv'
-child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'

pd.set_option('display.max_colwidth', None)

-for col in reviews.columns:
+category = 'Laptops'
+metadata = pd.read_json('amazon_data/meta_Electronics.json', lines=True)  # get_df('amazon_data/meta_Electronics.json.gz')
+for col in metadata.columns:
    print(col)

-c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
-p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
-c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
-c_reviews = c_reviews.head(MAX_ITEMS)
-p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
-p_reviews = p_reviews.head(MAX_ITEMS)
+metadata = metadata[metadata['category'].apply(lambda cats: category in cats)]

-p_reviews.to_csv(parent_output, sep='\t', index=False)
-c_reviews.to_csv(child_output, sep='\t', index=False)
-print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
+print(metadata['category'][:5])
+print(len(metadata.index))
+
+review_iter = pd.read_json('amazon_data/Electronics.json', lines=True, chunksize=1000)
+reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
+print(len(reviews.index))
+
+reviews.to_csv('target_extraction/data/verified_laptop_reviews.tsv', sep='\t', index=False)
+
+# child_product = 'speaker'
+# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
+#                       compression='gzip')
+# parent_output = 'target_extraction/data/electronics_reviews.tsv'
+# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
+#
+# for col in reviews.columns:
+#     print(col)
+#
+# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
+# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
+# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
+# c_reviews = c_reviews.head(MAX_ITEMS)
+# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
+# p_reviews = p_reviews.head(MAX_ITEMS)
+#
+# p_reviews.to_csv(parent_output, sep='\t', index=False)
+# c_reviews.to_csv(child_output, sep='\t', index=False)
+# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
+# # get metadata for sunglasses
+# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
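For reference, the verified_laptop_reviews.tsv written above is the file that target_extractor.py (below) later reads via obtain_texts with the 'reviewText' column. A minimal sketch of loading it back with pandas, assuming the paths and column names used in this commit (note that error_bad_lines, used above, is deprecated in pandas 1.3+ in favour of on_bad_lines):

import pandas as pd

# Load the TSV produced by prep_metadata.py; 'reviewText' is the column
# obtain_texts() reads for this file in target_extractor.py.
reviews = pd.read_csv('target_extraction/data/verified_laptop_reviews.tsv', sep='\t')
texts = [t for t in reviews['reviewText'] if not pd.isnull(t)]
print(len(texts), 'review texts loaded')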
ADA/server/agent/target_extraction/target_extractor.py +291 −55
@@ -2,110 +2,169 @@ import pandas as pd
import ast
from collections import Counter
from nltk import pos_tag
-from nltk.tokenize import word_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
from gensim.models.phrases import Phrases, Phraser
from concept_net import ConceptNet
from anytree import Node, RenderTree
import itertools
import numpy as np
import re
-from gensim.models import Word2Vec
+from gensim.models import Word2Vec, KeyedVectors
import pickle
import math

stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
cnet = ConceptNet()


def obtain_texts(path, col):
    file = pd.read_csv(path, sep='\t', error_bad_lines=False)
    return [text for _, text in file[col].items() if not pd.isnull(text)]
    # for text in ast.literal_eval(texts)]


def obtain_review_texts(path, title_col, review_col):
    file = pd.read_csv(path, sep='\t', error_bad_lines=False)
    return [(row[title_col], row[review_col]) for _, row in file.iterrows()
            if not pd.isnull(row[title_col]) and not pd.isnull(row[review_col])]


class TargetExtractor:
    PHRASE_THRESHOLD = 4
    MIN_RELATEDNESS = 0.3
-    N_ASPECTS = 30
+    N_ASPECTS = 50
    MIN_DIRECT_GAIN = 0.1
    DEPTH_COST = 0.3
-    FREQ_OVER_PARENT = 3  # target must appear x times more frequently than in parent
+    FREQ_OVER_PARENT = 10  # target must appear x times more frequently than in parent
+    OUTLIER_COEFFICIENT = 5
+    N_DIRECT_FEATURES = 3  # top N_DIRECT_FEATURES features will be direct children of the product (not subfeatures)
+    PARENT_COUNT_FRAC = 0.5  # feature f1 will only be considered as a subfeature of f2 if c(f1) / c(f2) > this value
+    # word2vec
+    MIN_TERM_COUNT = 100
+    MIN_SIMILARITY = 0
-    SYNONYM_SIMILARITY = 0.7
+    SYNONYM_SIMILARITY = 0.1

    # parent is a TargetExtrator of a parent category, eg. > electronics > camera
    def __init__(self, product, texts, parent=None):
        self.product = product
        self.parent = parent

        print('tokenizing phrases...')
        # tokenize and normalize phrases
-        self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))] for phrase in texts]
+        self.phrases = [[w.lower() for w in word_tokenize(phrase.replace('_', ' '))]
+                        for text in texts for phrase in sent_tokenize(text)]

        print('obtaining bigrams...')
        # train bigram map
-        tokenized_phrases = Phrases(self.phrases)
-        self.bigrammer = Phraser(tokenized_phrases)
+        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
+        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
+        self.phraser = Phraser(trigram)

        print('counting terms...')
        # count terms
        self.counter = self.count_nouns()
        self.total_count = sum(self.counter.values())

    def get_tree_and_synonyms(self):
        print('training word2vec model...')
        # train word2vec model
-        wv = self.get_word2vec_model()
+        self.wv = self.get_word2vec_model()

        print('mining aspects...')
        # mine aspects
-        aspects, counts = self.get_related_nouns(self.counter, wv)
-        print(aspects)
+        aspects, counts = self.get_related_nouns(self.counter, self.wv)

        print('extracting synonyms...')
        # obtain synonyms
-        syn_pairs = self.get_syn_pairs(aspects, wv)
-        print(syn_pairs)
+        syn_pairs = self.get_syn_pairs(aspects, self.wv)
        synset = Synset(aspects, syn_pairs)
-        syn_dict = synset.get_dict(counts)
+        self.syn_dict = synset.get_dict(counts)

-        # remove aspect synonyms
-        aspects = [aspect for aspect in aspects if aspect in syn_dict.keys()]
-        counts = {aspect: sum(counts[syn] for syn in syn_dict[aspect])
-                  for aspect, count in counts.items() if aspect in aspects}
+        # remove aspect synonyms and reorder list based on sum of all synonym counts
+        aspects = [aspect for aspect in aspects if aspect in self.syn_dict.keys()]
+        self.counts = {aspect: sum(counts[syn] for syn in self.syn_dict[aspect]) for aspect in aspects}
+        self.aspects = sorted(aspects, key=self.counts.get, reverse=True)
+        print(self.aspects)
+        print(self.syn_dict)

        print('extracting relatedness matrix...')
        # extract relationships between aspects
-        relatedness_matrix = self.get_relations(aspects, counts, syn_dict)
+        self.relatedness_matrix = self.get_scaled_relations()

        print('extracting aspect tree...')
        # extract aspect tree
-        tree = TargetExtractor.spanning_tree_from_root(aspects, relatedness_matrix)
+        self.tree = self.get_product_tree2()

+        return self.tree, self.syn_dict

+    def get_scaled_relations(self):
+        relatedness_matrix = np.ma.zeros((len(self.aspects), len(self.aspects)))
+        relatedness_matrix.mask = False
+        for tokenized_phrase in self.phrases:
+            phrase = self.phraser[tokenized_phrase]
+            matches = {a_idx: phrase_idxs for a_idx, phrase_idxs in
+                       {a_idx: [phrase_idx for syn in self.syn_dict[aspect]
+                                for phrase_idx, term in enumerate(phrase) if syn == term]
+                        for a_idx, aspect in enumerate(self.aspects)}.items()
+                       if len(phrase_idxs) > 0}
+            if len(matches) != 2:
+                continue
+            (idx1, p_idxs1), (idx2, p_idxs2) = matches.items()
+            combiner_indices = {(min(idx1, idx2) + 1, max(idx1, idx2) - 1)
+                                for idx1 in p_idxs1 for idx2 in p_idxs2 if abs(idx1 - idx2) < 10}
+            if any(any(combiner in ' '.join(phrase[start_idx:(end_idx + 1)])
+                       for combiner in ['and', 'as well as', 'in addition to'])
+                   for start_idx, end_idx in combiner_indices):
+                continue
+            relatedness_matrix[idx1][idx2] += 1
+            relatedness_matrix[idx2][idx1] += 1
+
+        for idx1, t1 in enumerate(self.aspects):
+            for idx2, t2 in enumerate(self.aspects):
+                relatedness_matrix[idx1][idx2] = relatedness_matrix[idx1][idx2] / (self.counts[t1] * math.sqrt(self.counts[t2]))
+                # math.sqrt(pow(counts[t1], 2) + pow(counts[t2], 2))
+                # mask value if it will not be considered as a parent
+                if idx1 == idx2 or self.counts[t2] / self.counts[t1] < TargetExtractor.PARENT_COUNT_FRAC:
+                    relatedness_matrix[idx1][idx2] = np.ma.masked
+
+        relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
-        return tree, syn_dict
+        return relatedness_matrix

    def get_relations(self, targets, counts, syn_dict):
-        pair_counts = {pair: 0 for pair in itertools.combinations(targets, 2)}
        relatedness_matrix = np.zeros((len(targets), len(targets)))
-        for phrase in self.phrases:
-            bigrams = self.bigrammer[phrase]
-            for pair in pair_counts:
-                t1, t2 = pair
-                if (any(term in bigrams for term in syn_dict[t1])
-                        and any(term in bigrams for term in syn_dict[t2])):
-                    pair_counts[pair] += 1
+        for tokenized_phrase in self.phrases:
+            phrase = self.phraser[tokenized_phrase]
+            matches = {t_idx for t_idx, target in enumerate(targets) if any(t in phrase for t in syn_dict[target])}
+            for idx1, idx2 in {frozenset((idx1, idx2)) for idx1 in matches for idx2 in matches if idx1 != idx2}:
+                relatedness_matrix[idx1][idx2] += 1
+                relatedness_matrix[idx2][idx1] += 1

        counts_arr = np.zeros(len(targets))
        for idx, target in enumerate(targets):
            counts_arr[idx] = counts[target]
        relatedness_matrix = (relatedness_matrix.T / counts_arr).T
        return relatedness_matrix

    def get_exclusive_relations(self, targets, counts, syn_dict):
        relatedness_matrix = np.zeros((len(targets), len(targets)))
-        for row in range(0, len(targets) - 1):
-            for col in range(row + 1, len(targets)):
-                t1 = targets[row]
-                t2 = targets[col]
-                score = pair_counts[(t1, t2)] / (counts[t1] * counts[t2])
-                relatedness_matrix[row][col] = score
+        for tokenized_phrase in self.phrases:
+            phrase = self.phraser[tokenized_phrase]
+            matches = {t_idx for t_idx, target in enumerate(targets) if any(t in phrase for t in syn_dict[target])}
+            if len(matches) == 2:
+                idx1, idx2 = matches
+                relatedness_matrix[idx1][idx2] += 1
+                relatedness_matrix[idx2][idx1] += 1

-        for col in range(0, len(targets) - 1):
-            for row in range(col + 1, len(targets)):
-                relatedness_matrix[row][col] = relatedness_matrix[col][row]
        counts_arr = np.zeros(len(targets))
        for idx, target in enumerate(targets):
            counts_arr[idx] = counts[target]
-        relatedness_matrix = np.divide(relatedness_matrix, np.amax(relatedness_matrix))
+        relatedness_matrix = (relatedness_matrix.T / counts_arr).T
        return relatedness_matrix

    def count_nouns(self):
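The __init__ changes above swap the single bigram Phraser for stacked bigram and trigram Phrases models, so multiword targets (e.g. battery_life, image_stabilization) survive as single tokens before counting. A minimal sketch of that stacking with gensim, using an illustrative toy corpus (threshold=4 mirrors PHRASE_THRESHOLD; whether a given pair actually gets merged depends on corpus statistics):

from gensim.models.phrases import Phrases, Phraser

# Illustrative tokenized sentences; in the commit these are self.phrases.
sentences = [['the', 'battery', 'life', 'is', 'great'],
             ['battery', 'life', 'could', 'be', 'better'],
             ['great', 'screen', 'and', 'battery', 'life']] * 50

bigram = Phrases(sentences, threshold=4)           # scores adjacent word pairs
trigram = Phrases(bigram[sentences], threshold=4)  # scores pairs over the bigrammed text
phraser = Phraser(trigram)                         # frozen model, cheaper to apply

# Pairs whose score exceeds the threshold are joined with '_' (e.g. 'battery_life').
print(phraser[['the', 'battery', 'life', 'is', 'great']])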
@@ -113,16 +172,21 @@ class TargetExtractor:
        for phrase in self.phrases:
            pos_tags = pos_tag(phrase)
-            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
-                       for bigram in self.bigrammer[phrase]]
+            bigrams = [re.sub('_*' + self.product + '_*', '', bigram) if bigram != self.product else bigram
+                       for bigram in self.phraser[phrase]]
            word_idx = 0
            for token in bigrams:
                if '_' in token:
                    words = token.split('_')
-                    if any(TargetExtractor.is_noun(pos_tags[i]) for i in range(word_idx, word_idx + len(words))):
+                    word_range = range(word_idx, word_idx + len(words))
+                    has_noun = any(TargetExtractor.is_noun(pos_tags[i]) for i in word_range)
+                    all_terms_valid = all(TargetExtractor.is_valid_term(pos_tags[i]) for i in word_range)
+                    if has_noun and all_terms_valid:
                        nouns.append(TargetExtractor.singular(token))
                    word_idx += len(words)
                else:
-                    if len(token) > 1 and TargetExtractor.is_noun(pos_tags[word_idx]):
+                    is_noun = TargetExtractor.is_noun(pos_tags[word_idx])
+                    is_valid = TargetExtractor.is_valid_term(pos_tags[word_idx])
+                    if len(token) > 1 and is_noun and is_valid:
                        nouns.append(TargetExtractor.singular(token))
                    word_idx += 1
@@ -134,11 +198,12 @@ class TargetExtractor:
        term_counts = []
        while len(term_counts) < TargetExtractor.N_ASPECTS:
            term, count = common.pop(0)
            print(term, count)
            # filter terms not related to the product
            # cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS
            if self.is_related_to_product(term, wv):
                term_counts.append((term, count))
                print('accepted:', term)
            else:
                print('rejected:', term)

        terms = [term for term, count in term_counts]
        # bring product to front of list
@@ -155,17 +220,31 @@ class TargetExtractor:
                (not self.parent or self.parent.frequency_for_term(term) == 0
                 or self.frequency_for_term(term) / self.parent.frequency_for_term(term) > TargetExtractor.FREQ_OVER_PARENT))
        # cnet.get_relatedness(term, self.product) > TargetExtractor.MIN_RELATEDNESS

    @staticmethod
    def get_syn_pairs(terms, model):
        return {frozenset((t1, t2)) for t1 in terms for t2 in terms
-                if t1 != t2 and model.similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}
+                if t1 != t2 and model.relative_cosine_similarity(t1, t2) > TargetExtractor.SYNONYM_SIMILARITY}

    def frequency_for_term(self, term):
        return self.counter[term] / self.total_count

    def get_word2vec_model(self):
-        return Word2Vec(self.phrases, min_count=5).wv
+        model = Word2Vec(self.phraser[self.phrases], min_count=TargetExtractor.MIN_TERM_COUNT).wv
+        return model

+    def save(self):
+        f = open(self.product + '_extractor.pickle', 'wb')
+        pickle.dump(self, f)
+        f.close()

+    @staticmethod
+    def load_saved(product):
+        f = open(product + '_extractor.pickle', 'rb')
+        extractor = pickle.load(f)
+        f.close()
+        return extractor

    @staticmethod
    def wordnet_relatedness(t1, t2):
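One behavioural change above is the synonym test in get_syn_pairs: plain cosine similarity against SYNONYM_SIMILARITY = 0.7 is replaced by gensim's relative cosine similarity against 0.1, which divides a pair's cosine similarity by the summed similarities of the word's top-n neighbours (values near 0.10 with the default topn=10 indicate near-synonyms). A minimal sketch of the two measures, assuming gensim and an illustrative toy corpus:

from gensim.models import Word2Vec

# Toy corpus; in the commit the model is trained on self.phraser[self.phrases].
sentences = [['screen', 'display', 'battery', 'laptop'],
             ['display', 'screen', 'keyboard', 'laptop'],
             ['battery', 'life', 'laptop', 'screen']] * 200
wv = Word2Vec(sentences, min_count=1).wv

# Old check: absolute cosine similarity, thresholded at 0.7.
print(wv.similarity('screen', 'display'))

# New check: similarity relative to the word's ten nearest neighbours, thresholded at 0.1.
print(wv.relative_cosine_similarity('screen', 'display', topn=10))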
@@ -192,6 +271,70 @@ class TargetExtractor:
        return root

+    # product has to be at idx 0
+    # targets have to be sorted in descending order based on counts (excluding product)
+    @staticmethod
+    def get_product_tree(dep_matrix, targets):
+        remaining_targets = [idx for idx in range(len(targets))]
+        root = Node(targets[remaining_targets.pop(0)])
+        n_null = 3
+        dependencies = [None] * n_null + [TargetExtractor.get_significant_dependence(idx, dep_matrix,
+                                          ignore_idx=[0] + list(range(idx + 1, len(targets))))
+                                          for idx in range(n_null, len(targets))]
+        print(dependencies)
+        while remaining_targets:
+            idx = remaining_targets.pop(0)
+            t_node = Node(targets[idx], parent=root)
+            t_node.idx = idx
+            dependants = [(d_idx, dep[1]) for d_idx, dep in enumerate(dependencies) if dep and dep[0] == idx]
+            print(t_node, [targets[i] for i, _ in dependants])
+            for d_idx, _ in sorted(dependants, key=lambda p: p[1], reverse=True):
+                if d_idx not in remaining_targets:
+                    continue
+                # parent = root
+                # if not t_node.children or any(TargetExtractor.are_correlated(d_idx, c.idx, dep_matrix, ignore_idx=[0, idx]) for c in t_node.children):
+                #     parent = t_node
+                d_node = Node(targets[d_idx], parent=t_node)
+                d_node.idx = d_idx
+                remaining_targets.remove(d_idx)
+        # for idx, t in enumerate(targets):
+        #     if idx == 0:
+        #         continue
+        #     dep_idx =
+        #     parent = next((d for d in root.descendants if d.idx == dep_idx), root)
+        #     node = Node(t, parent=parent)
+        #     node.idx = idx
+        return root

+    def get_product_tree2(self):
+        root = Node(self.aspects[0])
+        for idx in range(1, TargetExtractor.N_DIRECT_FEATURES + 1):
+            node = Node(self.aspects[idx], parent=root)
+            node.idx = idx
+        unassigned = {idx for idx in range(TargetExtractor.N_DIRECT_FEATURES + 1, len(self.aspects))}
+        for idx in range(1, len(self.aspects)):
+            # for each feature in order from highest to lowest count
+            print(self.aspects[idx])
+            # create node for aspect with parent root if it is unassigned
+            node = next((n for n in root.descendants if n.idx == idx), None)
+            if not node:
+                node = Node(self.aspects[idx], parent=root)
+                node.idx = idx
+                unassigned.remove(idx)
+            # get highest dependant from unassigned aspects if there exists a significant one
+            dep_idx = self.get_dependant(idx, [], unassigned)
+            while dep_idx is not None:
+                print('  ', self.aspects[dep_idx])
+                # assign dep as subfeature of t
+                dep_node = Node(self.aspects[dep_idx], parent=node)
+                dep_node.idx = dep_idx
+                unassigned.remove(dep_idx)
+                # get highest dependant from remaining targets if there exists a significant one
+                dep_idx = self.get_dependant(idx, [child.idx for child in node.children], unassigned)
+        return root

    @staticmethod
    def kruskal(vertices, edges):
        result = set()
@@ -217,6 +360,81 @@ class TargetExtractor:
        word, tag = pos_tagged
        return tag.startswith('NN') and word.lower() not in string.punctuation and word not in stop_words

+    # true if term is not a preposition and does not include special characters
+    @staticmethod
+    def is_valid_term(pos_tagged):
+        alpha_numeric_pat = '^\w+$'
+        word, tag = pos_tagged
+        return tag != 'IN' and re.match(alpha_numeric_pat, word)

+    @staticmethod
+    def print_relations(target_indices, dep_matrix, targets):
+        idx_pairs = {frozenset((idx1, idx2)) for idx1 in target_indices for idx2 in target_indices if idx1 != idx2}
+        for idx1, idx2 in idx_pairs:
+            t1 = targets[idx1]
+            t2 = targets[idx2]
+            print('{} {:.4f} {}'.format(t1, dep_matrix[idx1][idx2], t2))
+            print('{} {:.4f} {}'.format(' ' * len(t1), dep_matrix[idx2][idx1], ' ' * len(t2)))
+            print('')

+    def print_relations_from(self, aspect):
+        idx = self.aspects.index(aspect)
+        rels = self.relatedness_matrix[idx].copy()
+        rels.mask = False
+        for rel_idx in sorted(range(len(self.aspects)), key=lambda i: rels[i], reverse=True):
+            print('{:.4f}'.format(rels[rel_idx]), self.aspects[rel_idx])

+    def get_dependant(self, idx, child_indices, unassigned_indices):
+        ignore_idx = [0] + child_indices
+        max_dependant = (None, 0)
+        for u_idx in unassigned_indices:
+            # print('  ', self.aspects[u_idx])
+            dependence = self.get_significant_dependence(u_idx, ignore_idx=ignore_idx)
+            if dependence is not None:
+                dep_idx, score = dependence
+                # print('  ', self.aspects[dep_idx], score)
+                if dep_idx == idx and score > max_dependant[1]:
+                    max_dependant = (u_idx, score)
+        return max_dependant[0]

+    def get_significant_dependence(self, idx, ignore_idx=None):
+        if not ignore_idx:
+            ignore_idx = [0]
+        deps = self.relatedness_matrix[idx].copy()
+        for i in ignore_idx:
+            deps[i] = np.ma.masked
+        fst_high_outlier = TargetExtractor.high_outlier_idx(deps)
+        # print('  ', fst_high_outlier)
+        if fst_high_outlier is not None:
+            deps[fst_high_outlier[0]] = np.ma.masked
+            snd_high_outlier = TargetExtractor.high_outlier_idx(deps)
+            # np.delete(deps, fst_high_outlier)
+            # print('  ', snd_high_outlier)
+            if snd_high_outlier is None:
+                return fst_high_outlier
+        return None

+    @staticmethod
+    def high_outlier_idx(arr):
+        q1 = np.nanquantile(np.ma.filled(arr, np.NaN), 0.25)
+        q3 = np.nanquantile(np.ma.filled(arr, np.NaN), 0.75)
+        max_idx = np.nanargmax(np.ma.filled(arr, np.NaN))
+        lim = q3 + TargetExtractor.OUTLIER_COEFFICIENT * (q3 - q1)
+        # print('  ', arr, arr[max_idx], q1, q3, lim)
+        return (max_idx, arr[max_idx] / lim) if arr[max_idx] > lim else None

+    @staticmethod
+    def are_correlated(idx1, idx2, dep_matrix, ignore_idx=None):
+        if not ignore_idx:
+            ignore_idx = []
+        mask = [i in ignore_idx for i in range(len(dep_matrix[idx1]))]
+        deps1 = np.ma.masked_array(dep_matrix[idx1], mask=mask)
+        deps2 = np.ma.masked_array(dep_matrix[idx2], mask=mask)
+        return TargetExtractor.in_q3(deps1, idx2) and TargetExtractor.in_q3(deps2, idx1)

+    @staticmethod
+    def in_q3(arr, idx):
+        return arr[idx] >= np.quantile(arr, 0.75)


class Synset:
@@ -274,11 +492,29 @@ class Synset:
        return None


-laptop_texts = obtain_texts('data/laptop_reviews.tsv', 'review_body')
-laptop_extractor = TargetExtractor('laptop', laptop_texts)
-camera_texts = obtain_texts('data/camera_prepared_data.tsv', 'review_body')
-camera_extractor = TargetExtractor('camera', camera_texts, parent=laptop_extractor)
-tree, syns = camera_extractor.get_tree_and_synonyms()
+electronics_texts = obtain_texts('data/electronics_reviews.tsv', 'review_body')[:300000]
+electronics_extractor = TargetExtractor('device', electronics_texts)
+texts = obtain_texts('data/verified_laptop_reviews.tsv', 'reviewText')
+extractor = TargetExtractor('laptop', texts, parent=electronics_extractor)
+tree, syns = extractor.get_tree_and_synonyms()
print(RenderTree(tree))
print(syns)
+extractor.save()

+# np.set_printoptions(precision=4, suppress=True, threshold=np.inf)
+# extractor: TargetExtractor = TargetExtractor.load_saved()
+# extractor.relatedness_matrix = extractor.get_scaled_relations()
+# tree, _ = extractor.get_tree_and_synonyms()
+# print(RenderTree(tree))
+# print(extractor.aspects)
+# print(extractor.relatedness_matrix)
+# extractor.save()
+# print(extractor.aspects)
+# print(extractor.relatedness_matrix)
+# extractor.save()
+# for a in ['lcd_screen', 'viewfinder', 'lens', 'image_stabilization']:
+#     print(a)
+#     extractor.print_relations_from(a)
+# print(extractor.counts['lcd_screen'], extractor.counts['viewfinder'])
+# print(RenderTree(extractor.get_product_tree2()))
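The new save()/load_saved() pair above simply pickles the whole extractor, so the trained phraser, counts, relatedness matrix, tree and synonym dictionary can be reused without retraining. A minimal sketch of reloading the artefact written by extractor.save() for product='laptop' (unpickling requires the TargetExtractor class and its module to be importable in the current environment):

import pickle
from anytree import RenderTree

# 'laptop_extractor.pickle' is the filename save() derives from self.product.
with open('laptop_extractor.pickle', 'rb') as f:
    extractor = pickle.load(f)

print(RenderTree(extractor.tree))  # aspect hierarchy built by get_tree_and_synonyms()
print(extractor.syn_dict)          # aspect -> synonym mapping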