Joel Oksanen / individual_project / Commits

Commit ba2d99f3, authored Jun 08, 2020 by Joel Oksanen
parent a2ecd08d

    Minor eval changes

Changes: 9
ADA/server/agent/dataloader.py

@@ -2,7 +2,7 @@ import pandas as pd

class DataLoader:
-    data_location = 'agent/amazon_data/reviews_for_backpack.tsv'
+    data_location = 'agent/amazon_data/reviews_for_watches.tsv'
    reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)

    def get_reviews(self, product_id):
ADA/server/agent/prep_metadata.py

import pandas as pd
import gzip
import json

MAX_ITEMS = 200000


def parse(path):
    g = gzip.open(path, 'rb')
    for line in g:
        yield json.loads(line)


def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
        if i == MAX_ITEMS:
            break
    return pd.DataFrame.from_dict(df, orient='index')


pd.set_option('display.max_colwidth', None)

-category = 'Cardigans'
-metadata_iter = pd.read_json('amazon_data/meta_Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
-metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
-                      for metadata in metadata_iter])
-print(len(metadata.index))
-review_iter = pd.read_json('amazon_data/Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000)
-reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
-print(len(reviews.index))
-reviews.to_csv('target_extraction/data/verified_cardigan_reviews.tsv', sep='\t', index=False)
+def get_reviews(category, meta_file, review_file):
+    metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
+    metadata = pd.concat([metadata[metadata['category'].apply(lambda cl: type(cl) is list and category in cl)]
+                          for metadata in metadata_iter])
+    print(len(metadata.index))
+    review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
+    reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
+    print(len(reviews.index))
+    return reviews


def save_reviews(category, meta_file, review_file, output_file):
    reviews = get_reviews(category, meta_file, review_file)
    reviews.to_csv(output_file, sep='\t', index=False)
# child_product = 'speaker'
# reviews = pd.read_csv('amazon_data/amazon_reviews_us_Electronics_v1_00.tsv.gz', sep='\t', error_bad_lines=False,
# compression='gzip')
# parent_output = 'target_extraction/data/electronics_reviews.tsv'
# child_output = 'target_extraction/data/' + child_product + '_reviews.tsv'
#
# for col in reviews.columns:
# print(col)
#
# c_reviews = reviews[reviews['product_title'].str.contains(child_product, case=False, na=False)]
# p_reviews = reviews[~reviews['product_title'].str.contains(child_product, case=False, na=False)]
# c_reviews = c_reviews.sample(frac=1).reset_index(drop=True)
# c_reviews = c_reviews.head(MAX_ITEMS)
# p_reviews = p_reviews.sample(frac=1).reset_index(drop=True)
# p_reviews = p_reviews.head(MAX_ITEMS)
#
# p_reviews.to_csv(parent_output, sep='\t', index=False)
# c_reviews.to_csv(child_output, sep='\t', index=False)
# print('Successfully prepared data for', len(p_reviews.index), 'parent and', len(c_reviews.index), 'child reviews')
# # get metadata for sunglasses
# metadata = metadata[metadata['title'].str.contains(pat='[G]uitar', na=False, regex=True)]
def save_top_reviewed_products(n, category, meta_file, review_file, output_file, product_title):
    reviews = get_reviews(category, meta_file, review_file)
    top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
    reviews = reviews[reviews['asin'].apply(lambda asin: asin in top_reviewed)]
    reviews = reviews.rename(columns={'overall': 'star_rating', 'asin': 'product_id',
                                      'reviewerID': 'review_id', 'reviewText': 'review_body'})
    reviews = reviews[reviews['review_body'].apply(lambda b: b is not None and len(b) > 0)]
    reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
    reviews['product_title'] = product_title
    reviews.to_csv(output_file, sep='\t', index=False)
# # get metadata for camera products
# metadata = metadata[metadata['main_cat'] == 'Camera & Photo']
#
# # try to filter out camera accessories
# filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', 'book', 'filter', 'light', 'drive',
# 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security', 'cctv', 'cassette']
# filter_pat = ''
# for word in filter_words:
# word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
# filter_pat += word_filter + '|'
# filter_pat = filter_pat[:-1]
# r = re.compile(filter_pat)
# metadata = metadata[~metadata['title'].str.contains(pat=filter_pat, na=False, regex=True)]
# metadata = metadata[~metadata['category'].apply(lambda cats: any(r.search(cat) for cat in cats))]
#
# for _, row in metadata.head(20).iterrows():
# print('features:', row['feature'])
# print('description:', row['description'])
# print('tech1:', row['tech1'])
# print('tech2:', row['tech2'])
# save_top_reviewed_products(3, 'Wrist Watches', 'amazon_data/meta_Clothing_Shoes_and_Jewelry.json',
# 'amazon_data/Clothing_Shoes_and_Jewelry.json', 'amazon_data/reviews_for_watches.tsv',
# 'watch')
save_reviews('Necklaces',
             'agent/amazon_data/meta_Clothing_Shoes_and_Jewelry.json',
             'agent/amazon_data/Clothing_Shoes_and_Jewelry.json',
             'agent/target_extraction/data/verified_necklace_reviews.tsv')
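
For reference, the chunked read-and-filter pattern that get_reviews relies on can be sketched in isolation. The snippet below is a minimal, hypothetical example (the file toy_meta.json and its records are invented, not part of the repository): pd.read_json(..., lines=True, chunksize=...) yields an iterator of DataFrames, each chunk is filtered on its 'category' column, and the pieces are concatenated, which is the same pattern the new function applies to the full Amazon metadata dump.

import json
import pandas as pd

# hypothetical toy metadata file in the same JSON-lines layout as the Amazon dump
records = [
    {'asin': 'A1', 'category': ['Clothing', 'Necklaces']},
    {'asin': 'A2', 'category': ['Clothing', 'Cardigans']},
    {'asin': 'A3', 'category': None},
]
with open('toy_meta.json', 'w') as f:
    for r in records:
        f.write(json.dumps(r) + '\n')

category = 'Necklaces'
meta_iter = pd.read_json('toy_meta.json', lines=True, chunksize=2)
metadata = pd.concat([chunk[chunk['category'].apply(lambda cl: type(cl) is list and category in cl)]
                      for chunk in meta_iter])
print(metadata['asin'].tolist())  # expected: ['A1']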
ADA/server/agent/target_extraction/BERT/entity_extractor/bert_entity_extractor.py

@@ -20,7 +20,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0

# training
-N_EPOCHS = 2
+N_EPOCHS = 3
BATCH_SIZE = 32
WARM_UP_FRAC = 0.05

@@ -48,13 +48,13 @@ class BertEntityExtractor:
        return extractor

    @staticmethod
-    def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
+    def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
        extractor = BertEntityExtractor()
-        extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
+        extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
                                  valid_file_path=valid_file_path)
        return extractor

-    def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
+    def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
        # load training data
        if valid_file_path is None:
            train_data, valid_data = EntityDataset.from_file(file_path, size=size, valid_frac=valid_frac)

@@ -122,11 +122,11 @@ class BertEntityExtractor:
        if valid_data is not None:
            self.evaluate(data=valid_data)

+        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))

        end = time.time()
        print('Training took', end - start, 'seconds')

-        torch.save(self.net.state_dict(), save_path)

    def evaluate(self, file_path=None, data=None, size=None):
        # load eval data
        if file_path is not None:
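
A note on the save_path to save_file rename: the caller now passes a base name and train_with_file appends the '.pt' extension itself when the weights are written. A minimal sketch of that convention, using a stand-in module in place of the BERT network (all names here are illustrative only, not taken from the repository):

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                # stand-in for the fine-tuned BERT network
save_file = 'entity_extractor_demo'  # hypothetical base name, no extension
torch.save(net.state_dict(), '{}.pt'.format(save_file))

# loading back follows the same convention
net.load_state_dict(torch.load('{}.pt'.format(save_file)))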
ADA/server/agent/target_extraction/BERT/relation_extractor/bert_rel_extractor.py

@@ -19,7 +19,7 @@ LEARNING_RATE = 0.00002
MAX_GRAD_NORM = 1.0

# training
-N_EPOCHS = 2
+N_EPOCHS = 3
BATCH_SIZE = 16
WARM_UP_FRAC = 0.05

@@ -47,13 +47,13 @@ class BertRelExtractor:
        return extractor

    @staticmethod
-    def train_and_validate(file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
+    def train_and_validate(file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
        extractor = BertRelExtractor()
-        extractor.train_with_file(file_path, save_path, size=size, valid_frac=valid_frac,
+        extractor.train_with_file(file_path, save_file, size=size, valid_frac=valid_frac,
                                  valid_file_path=valid_file_path)
        return extractor

-    def train_with_file(self, file_path, save_path, size=None, valid_frac=None, valid_file_path=None):
+    def train_with_file(self, file_path, save_file, size=None, valid_frac=None, valid_file_path=None):
        # load training data
        if valid_file_path is None:
            train_data, valid_data = PairRelDataset.from_file(file_path, size=size, valid_frac=valid_frac)

@@ -121,11 +121,11 @@ class BertRelExtractor:
        if valid_data is not None:
            self.evaluate(data=valid_data)

+        torch.save(self.net.state_dict(), '{}.pt'.format(save_file))

        end = time.time()
        print('Training took', end - start, 'seconds')

-        torch.save(self.net.state_dict(), save_path)

    def evaluate(self, file_path=None, data=None, size=None):
        # load eval data
        if file_path is not None:
ADA/server/agent/target_extraction/BERT/relation_extractor/pair_rel_dataset.py

@@ -5,6 +5,7 @@ import pandas as pd
import numpy as np
from ast import literal_eval
from agent.target_extraction.BERT.relation_extractor.pairbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
+import os

MAX_SEQ_LEN = 128
RELATIONS = ['/has_feature', '/no_relation']

@@ -69,11 +70,12 @@ class PairRelDataset(Dataset):
        return dataset

    @staticmethod
-    def from_file(path, valid_frac=None, size=None):
-        if path.endswith('.json'):
-            dataset = PairRelDataset(pd.read_json(path, lines=True), size=size)
-        elif path.endswith('.tsv'):
-            dataset = PairRelDataset(pd.read_csv(path, sep='\t', error_bad_lines=False), size=size)
+    def from_file(file_name, valid_frac=None, size=None):
+        f = open(os.path.dirname(__file__) + '/../data/' + file_name)
+        if file_name.endswith('.json'):
+            dataset = PairRelDataset(pd.read_json(f, lines=True), size=size)
+        elif file_name.endswith('.tsv'):
+            dataset = PairRelDataset(pd.read_csv(f, sep='\t', error_bad_lines=False), size=size)
        else:
            raise AttributeError('Could not recognize file type')
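
With this change, from_file takes a bare file name and resolves it against the module's sibling data directory rather than treating the argument as a path. A minimal sketch of that resolution pattern (the directory layout mirrors what the new code assumes; the file name is illustrative):

import os

def resolve_data_file(file_name):
    # from_file opens os.path.dirname(__file__) + '/../data/' + file_name,
    # i.e. a 'data' directory that sits next to the relation_extractor package
    return os.path.dirname(os.path.abspath(__file__)) + '/../data/' + file_name

print(resolve_data_file('review_pairs_1.tsv'))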
ADA/server/agent/target_extraction/entity_annotation.py

@@ -111,7 +111,7 @@ class EntityAnnotator:
        os.system('clear')
-        print(fg.li_green + '{} entities annotated'.format(self.n_annotated) + fg.rs)
+        print(fg.li_green + '{} nouns annotated'.format(self.n_annotated) + fg.rs)
        print('')
        print(fg.li_black + 'root: \'r\'' + fg.rs)

@@ -249,11 +249,12 @@ class EntityAnnotator:
    def pair_relations_for_text(self, text, nan_entities=None):
        single_tokens = word_tokenize(text)
-        all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
+        tagged_single = pos_tag(single_tokens)
+        tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])

        entity_mentions = []
        for n in PreOrderIter(self.root):
-            cont, mention = self.mention_in_text(all_tokens, node=n)
+            cont, mention = self.mention_in_text(tagged_all, node=n)
            if not cont:
                # many mentions of same entity
                return None

@@ -269,7 +270,7 @@ class EntityAnnotator:
        if nan_entities is not None and len(entity_mentions) == 1:
            nan_mention = None
            for term in nan_entities:
-                cont, mention = self.mention_in_text(all_tokens, term=term)
+                cont, mention = self.mention_in_text(tagged_all, term=term)
                if not cont:
                    # many mentions of term
                    return None

@@ -286,10 +287,11 @@ class EntityAnnotator:
    # returns True, (synonym of node / term / None) if there is exactly one or zero such occurrence,
    # otherwise False, None, None
-    def mention_in_text(self, tokens, node=None, term=None):
+    def mention_in_text(self, tagged_tokens, node=None, term=None):
        mention = None
        for syn in ({syn.lower() for syn in self.synset[node]} if node is not None else {term}):
-            n_matches = sum(1 for token in tokens if syn == token.lower().replace('_', ' '))
+            n_matches = sum(1 for token, tag in tagged_tokens
+                            if syn == token.lower().replace('_', ' ') and tag.startswith('NN'))
            if n_matches > 1:
                return False, None
            if n_matches == 1:

@@ -301,11 +303,12 @@ class EntityAnnotator:
    def entity_mentions_in_text(self, text, all_entities):
        single_tokens = word_tokenize(text)
-        all_tokens = set().union(*[single_tokens, self.phraser[single_tokens]])
+        tagged_single = pos_tag(single_tokens)
+        tagged_all = set().union(*[tagged_single, pos_tag(self.phraser[single_tokens])])

        entity_mention = None
        for entity, is_aspect in all_entities:
-            cont, mention = self.mention_in_text(all_tokens, term=entity)
+            cont, mention = self.mention_in_text(tagged_all, term=entity)
            if not cont:
                # many mentions of same entity
                return None

@@ -355,12 +358,5 @@ class EntityAnnotator:
        return text, rels

-ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
-ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
-ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
-ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
-ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
-ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
-ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
-ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
+ea = EntityAnnotator.load_saved('example_annotator.pickle')
+ea.annotate()
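
The substantive change in mention_in_text is that matches are now counted over (token, tag) pairs and only noun-tagged tokens count as mentions. A standalone sketch of that filter using NLTK (the sentence is invented, and exact counts depend on the tagger's output):

from nltk import pos_tag, word_tokenize  # requires the punkt and averaged_perceptron_tagger models

text = 'The watch face scratches easily, but I still watch the time on it constantly.'
tagged = pos_tag(word_tokenize(text))

syn = 'watch'
# old behaviour: every token match counts, regardless of part of speech
n_all = sum(1 for token, _ in tagged if syn == token.lower().replace('_', ' '))
# new behaviour: only matches tagged as nouns (NN, NNS, ...) count
n_nouns = sum(1 for token, tag in tagged
              if syn == token.lower().replace('_', ' ') and tag.startswith('NN'))

print(n_all, n_nouns)  # ideally the verb use of 'watch' is no longer counted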
ADA/server/agent/target_extraction/full_pass.py
new file mode 100644

from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
from agent.target_extraction.BERT.data.combine_files import combine_files

# from agent.target_extraction.entity_annotation import EntityAnnotator
# ann: EntityAnnotator = EntityAnnotator.load_saved('acoustic_guitar_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_acoustic_guitar_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_acoustic_guitar_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('camera_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_camera_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_camera_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('laptop_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_laptop_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_laptop_review_entities.tsv')
# ann: EntityAnnotator = EntityAnnotator.load_saved('backpack_entity_annotator.pickle')
# ann.save_annotated_pairs('BERT/data/annotated_backpack_review_pairs.tsv')
# ann.save_annotated_entities('BERT/data/annotated_backpack_review_entities.tsv')
#
# entity_files = [
# 'BERT/data/annotated_camera_review_entities.tsv',
# 'BERT/data/annotated_backpack_review_entities.tsv',
# 'BERT/data/annotated_laptop_review_entities.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_entities.tsv',
# 'BERT/data/annotated_cardigan_review_entities.tsv'
# ]
# entity_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_entities_2.tsv'
#
# pair_files = [
# 'BERT/data/annotated_camera_review_pairs.tsv',
# 'BERT/data/annotated_backpack_review_pairs.tsv',
# 'BERT/data/annotated_laptop_review_pairs.tsv',
# 'BERT/data/annotated_acoustic_guitar_review_pairs.tsv',
# 'BERT/data/annotated_cardigan_review_pairs.tsv'
# ]
# pair_output_file = 'BERT/data/camera_backpack_laptop_guitar_cardigan_review_pairs_2.tsv'
#
# for n in range(1, 6):
# combine_files(entity_files[:n], 'BERT/data/review_entities_' + str(n) + '.tsv', total_size=50000)
# combine_files(pair_files[:n], 'BERT/data/review_pairs_' + str(n) + '.tsv', total_size=50000)
#
# combine_files(entity_files, entity_output_file)
# combine_files(pair_files, pair_output_file)
for n in range(1, 6):
    BertRelExtractor.train_and_validate('review_pairs_' + str(n) + '.tsv',
                                        'rel_extractor_' + str(n) + '_products')
    BertEntityExtractor.train_and_validate('review_entities_' + str(n) + '.tsv',
                                           'entity_extractor_' + str(n) + '_products')
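
The loop above appears to be the evaluation driver behind this commit's message: for n = 1..5 it trains a relation extractor and an entity extractor on the combined annotations of the first n product categories (the files the commented-out combine_files calls would produce), so the effect of adding product categories can be compared. A purely illustrative sketch of the file names involved, assuming the '.pt' suffix the trainers now append:

for n in range(1, 6):
    print('review_pairs_{}.tsv    -> rel_extractor_{}_products.pt'.format(n, n))
    print('review_entities_{}.tsv -> entity_extractor_{}_products.pt'.format(n, n))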
ADA/server/agent/target_extraction/target_extractor.py

@@ -16,16 +16,117 @@ from agent.target_extraction.BERT.entity_extractor.entity_dataset import EntityDataset
from agent.target_extraction.BERT.entity_extractor.bert_entity_extractor import BertEntityExtractor
from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import PairRelDataset
from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
-from pathos.multiprocessing import ProcessingPool as Pool
-import itertools
+from pandarallel import pandarallel

-np.set_printoptions(precision=3, threshold=np.inf, suppress=True)
+np.set_printoptions(precision=4, threshold=np.inf, suppress=True)

stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()
sentiment_lexicon = pd.read_csv('data/NRC-Sentiment-Lexicon-Wordlevel-v0.92.tsv', sep='\t', index_col=0)

-entity_extractor_path = 'BERT/entity_extractor/trained_bert_entity_extractor_camera_backpack_laptop.pt'
-rel_extractor_path = 'BERT/relation_extractor/trained_bert_rel_extractor_camera_backpack_laptop_no_nan.pt'
-pool = Pool(2)
+entity_extractor_path = 'BERT/entity_extractor/entity_extractor_five_products.pt'
+rel_extractor_path = 'BERT/relation_extractor/rel_extractor_five_products.pt'
+pandarallel.initialize()


+def ngrams(tagged_tokens, phraser):
+    tokens, tags = zip(*tagged_tokens)
+    unfiltered = [term.split('_') for term in phraser[tokens]]
+    tagged_unfiltered = []
+    n = 0
+    for term in unfiltered:
+        tagged_unfiltered.append([(subterm, list(tags)[n + idx]) for idx, subterm in enumerate(term)])
+        n += len(term)
+    return [subterm for term in tagged_unfiltered for subterm in filter_ngram(term)]


+def filter_ngram(term):
+    if len(term) > 1 and (any(not re.compile('NN|JJ').match(tag) for _, tag in term)
+                          or any(tag.startswith('JJ') and polar_adjective(t) for t, tag in term)):
+        return [subterm for subterm, _ in term]
+    return [' '.join([subterm for subterm, _ in term])]


+def polar_adjective(adj):
+    return adj in sentiment_lexicon.index and (sentiment_lexicon.loc[adj]['positive'] == 1
+                                               or sentiment_lexicon.loc[adj]['negative'] == 1)


+def count_phrase_nouns(tagged_tokens, tagged_ngrams):
+    def is_noun(pos_tagged):
+        word, tag = pos_tagged
+        return tag.startswith('NN') and word not in string.punctuation and word not in stop_words

+    # true if term is not a preposition and does not include special characters
+    def is_valid_term(pos_tagged):
+        alpha_numeric_pat = '^\w+$'
+        word, tag = pos_tagged
+        return tag != 'IN' and re.match(alpha_numeric_pat, word)

+    nouns = []
+    word_idx = 0
+    for token, _ in tagged_ngrams:
+        if ' ' in token:
+            words = token.split(' ')
+            word_range = range(word_idx, word_idx + len(words))
+            has_noun = any(is_noun(tagged_tokens[i]) for i in word_range)
+            all_terms_valid = all(is_valid_term(tagged_tokens[i]) for i in word_range)
+            if has_noun and all_terms_valid:
+                nouns.append(token)
+            word_idx += len(words)
+        else:
+            token_is_noun = is_noun(tagged_tokens[word_idx])
+            is_valid = is_valid_term(tagged_tokens[word_idx])
+            if len(token) > 1 and token_is_noun and is_valid:
+                nouns.append(token)
+            word_idx += 1
+    return nouns


+def entity_mentions_in_text(text, tagged_tokens, tagged_ngrams, entities):
+    all_tokens = {t for t, tag in set().union(*[tagged_tokens, tagged_ngrams]) if tag.startswith('NN')}

+    entity_mention = None
+    for entity in entities:
+        n_mentions = sum(1 for token in all_tokens if entity == token.lower())
+        if n_mentions > 1:
+            # many mentions of same entity
+            return None
+        if n_mentions == 1:
+            if entity_mention is None:
+                entity_mention = entity
+            elif entity_mention in entity:
+                entity_mention = entity
+            elif entity not in entity_mention:
+                # text cannot have more than one entity mention, unless one is a subset of the other,
+                # in which case the longer one is taken
+                return None

+    if entity_mention is not None:
+        return text, [{'text': entity_mention}]
+    return None


+def pair_relations_for_text(text, tagged_ngrams, aspects, syn_dict):
+    def overlapping_terms(ts, t):
+        if len(ts) == 0:
+            return False
+        return any(t in t2.split(' ') if len(t) < len(t2) else t2 in t.split(' ') for t2 in ts)

+    found_aspects = []
+    for aspect in aspects:
+        found_form = False
+        for form in syn_dict[aspect]:
+            if any(t == form and tag.startswith('NN') for t, tag in tagged_ngrams):
+                if len(found_aspects) > 1 or found_form or overlapping_terms(found_aspects, form):
+                    # cannot have more than two aspects, or two forms of the same aspect, or overlapping terms
+                    return None
+                found_aspects.append(form)
+                found_form = True

+    return (text, [{'em1Text': found_aspects[0], 'em2Text': found_aspects[1]}]) if len(found_aspects) == 2 else None


class TargetExtractor:
@@ -39,7 +140,7 @@ class TargetExtractor:
    # word2vec
    MIN_TERM_COUNT = 100
-    SYNONYM_SIMILARITY = 0.12
+    SYNONYM_SIMILARITY = 0.11
    SYNONYM_SIMILARITY_PRODUCT = 0.09
    WV_SIZE = 100
    WV_WINDOW = 7
@@ -54,36 +155,36 @@ class TargetExtractor:
        self.file_path = file_path

        print('tokenizing phrases...')
-        # tokenize and normalize phrases
-        texts = TargetExtractor.obtain_texts(file_path, text_column)
-        self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
-        self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
-        self.phrases = pool.map(word_tokenize, self.sentences)
+        self.texts = TargetExtractor.obtain_texts(file_path, text_column, n=50000)
+        # obtain normalized sentences
+        self.texts = self.texts.rename(columns={text_column: 'sentence'})
+        self.texts['sentence'] = self.texts['sentence'].parallel_apply(sent_tokenize)
+        self.texts = self.texts.explode('sentence').reset_index(drop=True)
+        self.texts['sentence'] = self.texts['sentence'].parallel_apply(lambda s: s.replace('_', ' ').lower())
+        # obtain tokens and their pos tags in a new column
+        self.texts['tokens'] = self.texts['sentence'].parallel_apply(lambda s: pos_tag(word_tokenize(s)))

        print('obtaining n-grams...')
-        # train bigram map
-        bigram = Phrases(self.phrases, threshold=TargetExtractor.PHRASE_THRESHOLD)
-        trigram = Phrases(bigram[self.phrases], threshold=TargetExtractor.PHRASE_THRESHOLD)
-        self.phraser = Phraser(trigram)
-        self.ngram_phrases = self.ngrams(self.phrases)
+        # train ngrams and their pos tags in a new column
+        tokens = [[t for t, tag in tagged_ts] for tagged_ts in self.texts['tokens']]
+        bigram = Phrases(tokens, threshold=TargetExtractor.PHRASE_THRESHOLD)
+        trigram = Phrases(bigram[tokens], threshold=TargetExtractor.PHRASE_THRESHOLD)
+        phraser = Phraser(trigram)
+        self.texts['ngrams'] = self.texts.apply(lambda row: ngrams(row.tokens, phraser), axis=1)
+        self.texts['ngrams'] = self.texts['ngrams'].parallel_apply(lambda t: pos_tag(t))

        print('counting terms...')
        # count terms
        self.counter = self.count_nouns()
        self.total_count = sum(self.counter.values())
        self.save()

        print('training word2vec model...')
        # train word2vec model
        self.wv = self.get_word2vec_model(size=TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
                                          min_count=TargetExtractor.MIN_TERM_COUNT)
        self.save()

        print('mining aspects...')
        # mine aspects
-        self.aspects, self.counts = self.get_aspects(self.counter)
+        self.aspects, self.counts = self.get_aspects()

        print('extracting synonyms...')
        # obtain synonyms
@@ -96,15 +197,9 @@ class TargetExtractor:
        self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect]) for aspect in self.aspects}
        self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
        print(self.syn_dict)
        self.save()

        print('extracting relatedness matrix...')
        self.relatedness_matrix = self.get_bert_relations()
        self.save()

        print('extracting aspect tree...')
        self.tree = self.get_product_tree()
@@ -126,52 +221,37 @@ class TargetExtractor:
    @staticmethod
    def obtain_texts(path, col, n=None):
-        file = pd.read_csv(path, sep='\t', error_bad_lines=False)
-        file = file[~file[col].isnull()]
-        if n and n < len(file.index):
-            file = file.sample(frac=1).reset_index(drop=True)
-            file = file.head(n)
-        texts = [text for _, text in file[col].items() if not pd.isnull(text)]
+        texts = pd.read_csv(path, usecols=[col], squeeze=False, sep='\t', error_bad_lines=False)
+        texts = texts[~texts[col].isnull()]
+        texts = texts[texts[col] != '']
+        if n and n < len(texts.index):
+            texts = texts.sample(frac=1).reset_index(drop=True)
+            texts = texts.head(n)
+        # texts = [text for _, text in df[col].items() if not pd.isnull(text)]
        print('Obtained {} texts'.format(len(texts)))
        return texts
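
The rewritten __init__ above drops the pathos process pool in favour of pandarallel and keeps all intermediate results in a single DataFrame. That preprocessing pipeline can be sketched on a toy frame as below; this is a minimal sketch assuming pandarallel and the NLTK tokenizer/tagger models are installed, and the review texts are invented:

import pandas as pd
from nltk import pos_tag, word_tokenize, sent_tokenize
from pandarallel import pandarallel

pandarallel.initialize()

texts = pd.DataFrame({'sentence': [
    'Great watch. The strap broke after a week though.',
    'The watch_face is easy to read.',
]})

# split each review into sentences, then give every sentence its own row
texts['sentence'] = texts['sentence'].parallel_apply(sent_tokenize)
texts = texts.explode('sentence').reset_index(drop=True)

# normalise and POS-tag each sentence, mirroring the new 'tokens' column
texts['sentence'] = texts['sentence'].parallel_apply(lambda s: s.replace('_', ' ').lower())
texts['tokens'] = texts['sentence'].parallel_apply(lambda s: pos_tag(word_tokenize(s)))

print(texts[['sentence', 'tokens']])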