Joel Oksanen / individual_project / Commits / 8416bbe7

Commit 8416bbe7, authored Jun 21, 2020 by Joel Oksanen

    Implementation complete

parent e38035ff
Changes: 13 files
ADA/server/agent/SA/bert_analyzer.py

@@ -8,6 +8,8 @@ import time
 import numpy as np
 from sklearn import metrics
 
+device = torch.device('cuda')
+
 semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'

@@ -43,13 +45,20 @@ class BertAnalyzer:
                                   collate_fn=generate_batch)
 
         self.net = TDBertNet(len(polarity_indices))
+        # initialise GPU
+        self.net.cuda()
+
         optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
 
         start = time.time()
 
         for epoch in range(MAX_EPOCHS):
             batch_loss = 0.0
-            for i, (texts, target_indices, labels) in enumerate(train_loader):
+            for i, batch in enumerate(train_loader):
+                # send batch to gpu
+                texts, target_indices, labels = tuple(i.to(device) for i in batch)
+
                 # zero param gradients
                 optimiser.zero_grad()

@@ -103,10 +112,14 @@ class BertAnalyzer:
         dataset = BertDataset.from_data(data)
         loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=8,
                             collate_fn=generate_batch)
+        self.net.cuda()
         self.net.eval()
 
         predicted = []
         with torch.no_grad():
-            for texts, target_indices, _ in loader:
-                outputs, attentions = self.net(texts, target_indices)
+            for input_ids, attn_mask, target_indices, _ in loader:
+                input_ids, attn_mask, target_indices = tuple(i.to(device) for i in [input_ids, attn_mask, target_indices])
+                outputs = self.net(input_ids, attn_mask, target_indices)
                 batch_val, batch_pred = torch.max(outputs.data, 1)
                 predicted += [BertAnalyzer.get_polarity(val, pred) for val, pred in zip(batch_val, batch_pred)]
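Note: the hunks above follow the standard PyTorch pattern for CUDA work: the model is moved once with `.cuda()`, and every tensor in a batch is moved to the same device before the forward pass. A minimal self-contained sketch of that pattern (the names `model` and `batch` here are illustrative, not from the repo):

    import torch

    # fall back to CPU so the sketch runs anywhere; the commit assumes CUDA
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = torch.nn.Linear(4, 2).to(device)                 # stands in for TDBertNet
    batch = (torch.randn(8, 4), torch.randint(0, 2, (8,)))   # (inputs, labels)

    # same move as in the diff: push every tensor of the batch to the device
    inputs, labels = tuple(t.to(device) for t in batch)
    loss = torch.nn.functional.cross_entropy(model(inputs), labels)
    loss.backward()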
ADA/server/agent/SA/bert_dataset.py

@@ -12,9 +12,11 @@ MASK_TOKEN = '[MASK]'
 
 def generate_batch(batch):
-    texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
+    encoded = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
                                         max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
                                         return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
 
     max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
     target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES

@@ -23,7 +25,7 @@ def generate_batch(batch):
     polarity_labels = torch.tensor([entry['polarity'] for entry in batch])
 
-    return texts, target_indices, polarity_labels
+    return input_ids, attn_mask, target_indices, polarity_labels
 
 
 def token_for_char(char_idx, text, tokens):
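Note: `batch_encode_plus` returns a dict-like `BatchEncoding`, which this hunk unpacks into the two tensors the model consumes. A sketch of the call in isolation; the flags shown use current transformers spellings, whereas the 2020-era API targeted here spelled them `pad_to_max_length=True` and `is_pretokenized=True`:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoded = tokenizer.batch_encode_plus(
        [['the', 'screen', 'is', 'great'], ['bad', 'battery']],
        add_special_tokens=True,
        max_length=128,
        padding='max_length',        # older API: pad_to_max_length=True
        is_split_into_words=True,    # older API: is_pretokenized=True
        return_tensors='pt')
    input_ids = encoded['input_ids']        # (batch, seq_len) token ids
    attn_mask = encoded['attention_mask']   # 1 for real tokens, 0 for padding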
ADA/server/agent/SA/tdbertnet.py

@@ -11,18 +11,17 @@ class TDBertNet(nn.Module):
 
     def __init__(self, num_class):
         super(TDBertNet, self).__init__()
-        config = BertConfig.from_pretrained(TRAINED_WEIGHTS, output_attentions=True)
+        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
         self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
-        self.bert_base.config.output_attentions = True
         self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels
 
-    def forward(self, texts, target_indices):
+    def forward(self, input_ids, attn_mask, target_indices):
         # BERT
-        bert_output, _, attentions = self.bert_base(**texts)
+        bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
         # max pooling at target locations
         target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
         pooled_output = torch.max(target_outputs, dim=1)[0]
         # fc layer with softmax activation
         x = F.softmax(self.fc(pooled_output), 1)
-        return x, attentions[-1]
+        return x
ADA/server/agent/argument.py
0 → 100644

+class Argument:
+
+    def __init__(self, text, polarity, supporters, attackers, phrase, size):
+        self.text = text
+        self.polarity = 'POS' if polarity else 'NEG'
+        self.supporters = supporters
+        self.attackers = attackers
+        self.phrase = phrase.text if phrase else '-'
+        self.size = size
ADA/server/agent/communicator.py

@@ -21,7 +21,7 @@ class Communicator:
 
     def get_init_message(self):
         prod_node = self.product.root
         prod = self.product.argument_for_node(prod_node)
-        text = ADAText('What would you like to know about the *?', [prod.name])
+        text = ADAText('What would you like to know about this *?', [prod.name])
         queries = self.get_queries(prod_node)
         args = [prod.with_queries(queries)]
         return ADAMessage(text, args)

@@ -75,9 +75,20 @@ class Communicator:
         if query_id == 4 or query_id == 5:
             phrase = (self.framework.best_supporting_phrase(q_arg_node) if query_id == 4
                       else self.framework.best_attacking_phrase(q_arg_node))
-            while phrase[-1] == '.':
-                phrase = phrase[:-1]
-            text = ADAText('\"...*...\"', [phrase], style='QUOT')
+            template = ''
+            args = []
+            i = 0
+            for form, start, end in phrase.get_arg_mentions(q_arg_node):
+                template += phrase.text[i:start] + '*'
+                i = end
+                args.append(form)
+            template += phrase.text[i:len(phrase.text)]
+            while template[-1] == '.':
+                template = template[:-1]
+            text = ADAText('\"...{}...\"'.format(template), args, style='QUOT')
             args = [q_arg_node]
         args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]

@@ -106,3 +117,6 @@ class Communicator:
 
     def was_were(self, arg_n):
         return 'was' if self.product.singularities[arg_n] else 'were'
+
+    def get_argument_graph(self):
+        return self.framework.get_argument_graph()
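Note: the new quoting logic splices a `*` placeholder over each mention of the queried argument, collecting the surface forms as template arguments. The span-splicing step in isolation (toy data, no ADA classes):

    def splice_mentions(text, mentions):
        # mentions: (form, start, end) character spans in left-to-right order
        template, args, i = '', [], 0
        for form, start, end in mentions:
            template += text[i:start] + '*'
            args.append(form)
            i = end
        return template + text[i:], args

    template, args = splice_mentions('the battery life is great', [('battery life', 4, 16)])
    print(template)  # the * is great
    print(args)      # ['battery life']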
ADA/server/agent/dataloader.py

@@ -2,7 +2,7 @@ import pandas as pd
 
 class DataLoader:
-    data_location = 'agent/amazon_data/top_5_mixer_reviews_subset.tsv'
+    data_location = 'agent/amazon_data/reviews.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 
     @staticmethod
ADA/server/agent/framework.py

-from anytree import PostOrderIter
+from anytree import PostOrderIter, PreOrderIter
 from functools import reduce
 from agent.SA.bert_analyzer import BertAnalyzer
 from agent.target_extraction.product import Product
 from agent.review import Review
 from agent.dataloader import DataLoader
+from agent.argument import Argument
 import pickle
 import re
+from time import time
 
 
 class Framework:

@@ -19,9 +21,14 @@ class Framework:
         self.arguments = self.product.argument_nodes
         self.features = self.product.feature_nodes
 
+        ts = time()
+
         # get reviews
         review_csv = DataLoader.get_reviews(product_id)
-        reviews = [Review(row, self.product) for _, row in review_csv.iterrows()]
+        reviews = [Review(row, self.product) for _, row in review_csv.head(1000).iterrows()]
+
+        t_feature = time()
+        print('Feature detection took {} seconds'.format(t_feature - ts))
 
         # extract augmented votes
         self.extract_votes(reviews)

@@ -29,15 +36,25 @@ class Framework:
         if len(voting_reviews) / len(reviews) < 0.33:
             print('warning: only a small fraction of reviews generated votes')
 
+        t_sa = time()
+        print('Sentiment analysis took {} seconds'.format(t_sa - t_feature))
+
         # get aggregates
         ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
 
+        t_ra = time()
+        print('Review aggregation took {} seconds'.format(t_ra - t_sa))
+
         # get qbaf from ra
         self.qbaf, self.argument_polarities = self.get_qbaf(ra, len(reviews))
 
         # apply gradual semantics
         self.strengths = self.get_strengths(self.qbaf)
 
+        te = time()
+        print('QBAF construction took {} seconds'.format(te - t_ra))
+        print('Process took {} seconds'.format(te - ts))
+
         # save
         self.save()

@@ -78,7 +95,7 @@ class Framework:
         for review in reviews:
             for phrase in review.phrases:
                 for arg, sentiment in phrase.get_votes().items():
-                    vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)})
+                    vote_phrases[arg].append(phrase)  # {'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)}
             for arg, sentiment in review.get_votes().items():
                 ra.append({'review_id': review.id, 'argument': arg, 'vote': sentiment})
                 vote_sum[arg] += sentiment

@@ -187,31 +204,36 @@ class Framework:
         return att is not None and self.strengths[att] > 0
 
     def best_supporting_phrase(self, argument):
-        phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)
-                   if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
+        phrases = [phrase for phrase in self.supporting_phrases(argument)
+                   if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
         if len(phrases) == 0:
             return None
-        top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k], reverse=True))[:5]
-        print(top_5)
-        return max(top_5, key=lambda p: len(p))
+        top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument), reverse=True))[:5]
+        return max(top_5, key=lambda p: len(p.text))
 
     def best_attacking_phrase(self, argument):
-        phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)
-                   if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
+        phrases = [phrase for phrase in self.attacking_phrases(argument)
+                   if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
         if len(phrases) == 0:
             return None
-        top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k]))[:5]
-        print(top_5)
-        return max(top_5, key=lambda p: len(p))
+        top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument)))[:5]
+        return max(top_5, key=lambda p: len(p.text))
 
     @staticmethod
     def is_well_formatted(phrase):
+        if not re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase):
+            print(phrase)
         return re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase)
 
     def supporting_phrases(self, argument):
-        return list(filter(lambda vp: vp['sentiment'] > 0, self.vote_phrases[argument]))
+        return list(filter(lambda phrase: phrase.get_vote(argument) > 0, self.vote_phrases[argument]))
 
     def attacking_phrases(self, argument):
-        return list(filter(lambda vp: vp['sentiment'] < 0, self.vote_phrases[argument]))
+        return list(filter(lambda phrase: phrase.get_vote(argument) < 0, self.vote_phrases[argument]))
 
+    def get_argument_graph(self):
+        return self.create_arg(self.product_node, 120)
+
+    def create_arg(self, arg_node, size):
+        supporters = [self.create_arg(supp_node, size - 20) for supp_node in self.qbaf['supporters'][arg_node]]
+        attackers = [self.create_arg(att_node, size - 20) for att_node in self.qbaf['attackers'][arg_node]]
+        phrase = (self.best_supporting_phrase(arg_node) if self.argument_polarities[arg_node]
+                  else self.best_attacking_phrase(arg_node))
+        return Argument(arg_node.name, self.argument_polarities[arg_node], supporters, attackers, phrase, size)
ADA/server/agent/prep_metadata.py

 import pandas as pd
 
 pd.set_option('display.max_colwidth', None)
+all_reviews_file = 'amazon_data/reviews.tsv'
 
 
 def get_reviews(category, meta_file, review_file):

@@ -21,7 +22,7 @@ def save_reviews(category, meta_file, review_file, output_file):
     reviews.to_csv(output_file, sep='\t', index=False)
 
 
-def save_top_reviewed_products(n, output_file, product_title, category=None, review_file=None, meta_file=None,
+def save_top_reviewed_products(n, category=None, review_file=None, meta_file=None,
                                product_file=None):
     if product_file:
         reviews = pd.read_csv(product_file, sep='\t')

@@ -33,9 +34,35 @@ def save_top_reviewed_products(n, output_file, product_title, category=None, rev
                                           'reviewText': 'review_body'})
     reviews = reviews[reviews['review_body'].apply(lambda b: not pd.isna(b) and len(b) > 0)]
     reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
-    reviews['product_title'] = product_title
-    reviews.to_csv(output_file, sep='\t', index=False)
+    titles = {asin: input('Product title for {}: '.format(asin)) for asin in reviews['product_id'].unique()}
+    reviews['product_title'] = reviews['product_id'].apply(lambda asin: titles[asin])
+    all_reviews = pd.read_csv(all_reviews_file, sep='\t')
+    all_reviews = pd.concat([all_reviews, reviews])
+    all_reviews.to_csv(all_reviews_file, sep='\t', index=False)
 
 
+def get_top_products_by_brand(n, brand, meta_file, review_file):
+    metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
+    metadata = pd.concat([metadata[metadata['brand'].apply(lambda b: type(b) is str and b == brand)]
+                          for metadata in metadata_iter])
+    print(len(metadata.index))
+    print(metadata.head())
+    print(metadata.columns)
-save_top_reviewed_products(5, 'amazon_data/top_5_mixer_reviews.tsv',
-                           'KitchenAid KSM150PSGR Artisan Series 5-Qt. Stand Mixer with Pouring Shield - Imperial Grey',
-                           product_file='target_extraction/data/verified_stand_mixer_reviews.tsv')
+    review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
+    reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
+    print(len(reviews.index))
+    top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
+    return top_reviewed
+
+
+def get_product_reviews_for_asin(asin, review_file, output_file):
+    review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
+    reviews = pd.concat([reviews[reviews['asin'].apply(lambda p_asin: p_asin == asin)] for reviews in review_iter])
+    print(len(reviews.index))
+    reviews.to_csv(output_file, sep='\t', index=False)
ADA/server/agent/review.py

 import re
-from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
 from agent.SA.bert_dataset import MAX_SEQ_LEN
 from anytree import PostOrderIter
 from nltk.stem import WordNetLemmatizer
 
 wnl = WordNetLemmatizer()
+tokenizer = TreebankWordTokenizer()
 
 
 class Review:

@@ -61,7 +62,8 @@ class Phrase:
 
     def __init__(self, text, product):
         self.product = product
         self.text = text
-        self.tokens = [word.lower() for word in word_tokenize(text)]
+        self.spans = list(tokenizer.span_tokenize(text))
+        self.tokens = [text[start:end] for start, end in self.spans]
         self.args = self.get_args()
         self.votes = {}

@@ -72,7 +74,7 @@ class Phrase:
         while len(arguments) > 0:
             arg = arguments.pop(0)
             for term in self.product.glossary[arg]:
-                matches = [Arg(arg, start, end)
+                matches = [Arg(arg, ' '.join(term), start, end)
                            for start, end in Phrase.matching_subsequences(term, self.tokens)]
                 if matches:
                     argument_matches += matches

@@ -89,8 +91,8 @@ class Phrase:
                 pass
             self.remove_ancestors(node.parent, l)
 
-    def add_arg(self, arg):
-        self.args.append(arg)
+    # def add_arg(self, arg):
+    #     self.args.append(arg)
 
     def num_args(self):
         return len(self.args)

@@ -102,6 +104,20 @@ class Phrase:
                 self.votes[arg.node] = arg.sentiment
         return self.votes
 
+    def get_vote(self, node):
+        return self.votes[node]
+
+    def get_arg_mentions(self, node):
+        mentions = []
+        for arg in self.args:
+            if arg.node == node:
+                start, end = self.spans[arg.start][0], self.spans[arg.end - 1][1]
+                mentions.append((arg.form, start, end))
+        return mentions
+
+    def n_args(self):
+        return len(self.args)
+
     @staticmethod
     def matching_subsequences(l_sub, l):
         sub_idxs = []

@@ -114,8 +130,9 @@ class Phrase:
 
 class Arg:
 
-    def __init__(self, node, start, end):
+    def __init__(self, node, form, start, end):
         self.node = node
+        self.form = form
         self.start = start
         self.end = end
         self.sentiment = None
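Note: switching from `word_tokenize` to `TreebankWordTokenizer.span_tokenize` keeps the (start, end) character offsets of every token, which is what lets the new `get_arg_mentions` map token-level matches back to character positions in the phrase. For example:

    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    text = 'Great mixer, solid motor.'
    spans = list(tokenizer.span_tokenize(text))
    tokens = [text[start:end] for start, end in spans]
    print(tokens)    # ['Great', 'mixer', ',', 'solid', 'motor', '.']
    print(spans[4])  # (19, 24), the character span of 'motor'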
ADA/server/agent/target_extraction/eval/eval.py

@@ -14,6 +14,14 @@ def get_votes(df):
     return df.apply(lambda row: row['true'] > row['false'], axis=1)
 
 
+def get_votes_for_product(product):
+    for i in range(n_raters):
+        votes = get_votes(get_df(product, i))
+        print(methods[i])
+        print(votes)
+        print('')
+
+
 def get_accuracy(df):
     votes = get_votes(df)
     return sum(1 if vote else 0 for vote in votes) / len(votes) if len(votes) > 0 else None

@@ -45,10 +53,11 @@ def get_kappa():
     return (p - pe) / (1 - pe)
 
-# for i in range(3):
-#     print(i)
-#     for prod in products:
-#         print(' ', prod, len(get_df(prod, i)))
-#     print(' ', sum(len(get_df(prod, i)) for prod in products))
 
+def get_agreement():
+    df = pd.concat(get_df(prod, i) for prod in products for i in range(3)).reset_index(drop=True)
+    agreed = df[df.apply(lambda row: row['true'] == n_raters or row['false'] == n_raters, axis=1)]
+    return len(agreed) / len(df)
 
-print_accuracies()
\ No newline at end of file
+for p in products:
+    print(len(get_df(p, 0)))
\ No newline at end of file
ADA/server/agent/target_extraction/target_extractor.py

@@ -6,7 +6,7 @@ from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 import string
 from gensim.models.phrases import Phrases, Phraser
-from anytree import Node, RenderTree
+from anytree import Node, RenderTree, PreOrderIter
 import numpy as np
 import re
 from gensim.models import Word2Vec

@@ -18,6 +18,7 @@ from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import Pai
 from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
 from pathos.multiprocessing import ProcessingPool as Pool
 import itertools
+from time import time
 
 np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
 stop_words = stopwords.words('english')

@@ -160,9 +161,11 @@ class TargetExtractor:
         self.product = product
         self.file_path = file_path
 
+        ts = time()
+
         print('tokenizing phrases...')
         # tokenize and normalize phrases
-        texts = TargetExtractor.obtain_texts(file_path, text_column, n=200000)
+        texts = TargetExtractor.obtain_texts(file_path, text_column, n=500000)
         self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
         self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
         self.phrases = pool.map(word_tokenize, self.sentences)

@@ -179,19 +182,21 @@ class TargetExtractor:
         self.counter = self.count_nouns()
         self.total_count = sum(self.counter.values())
         self.save()
 
+        t_noun = time()
+        print('Noun extraction took {} seconds'.format(t_noun - ts))
+
+        print('mining aspects...')
+        # mine aspects
+        self.aspects, self.counts = self.get_aspects(self.counter)
+
+        t_feature = time()
+        print('Feature extraction took {} seconds'.format(t_feature - t_noun))
+
         print('training word2vec model...')
         # train word2vec model
         self.wv = self.get_word2vec_model(size=TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
                                           min_count=TargetExtractor.MIN_TERM_COUNT)
         self.save()
 
-        print('mining aspects...')
-        # mine aspects
-        self.aspects, self.counts = self.get_aspects(self.counter)
-
         print('extracting synonyms...')
         # obtain synonyms
         self.syn_dict = self.get_syn_dict()

@@ -201,18 +206,19 @@ class TargetExtractor:
         self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect])
                        for aspect in self.aspects}
         self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
         print(self.syn_dict)
         self.save()
 
+        t_syn = time()
+        print('Synonym extraction took {} seconds'.format(t_syn - t_feature))
+
         print('extracting relatedness matrix...')
         self.relatedness_matrix = self.get_bert_relations()
         self.save()
 
         print('extracting aspect tree...')
         self.tree = self.get_product_tree3()
 
+        te = time()
+        print('Ontology extraction took {} seconds'.format(te - t_syn))
+        print('Full process took {} seconds'.format(te - ts))
+
         print('saving...')
         self.save()

@@ -508,8 +514,3 @@ class Synset:
             if w in group:
                 return group
         return None
-
-# for p in ['mixer', 'game', 'necklace', 'watch', 'tv']:
-#     extr = TargetExtractor.load_saved(p)
-#     extr.save_product_representation()
ADA/server/ios_server/urls.py

@@ -4,6 +4,8 @@ from . import views
 urlpatterns = [
     path('', views.index, name='index'),
     path('products/', views.products, name='products'),
     path('product/', views.product, name='product'),
-    path('message/', views.message, name='message')
+    path('message/', views.message, name='message'),
+    path('arguments/', views.arguments, name='arguments')
 ]
ADA/server/ios_server/views.py

@@ -4,46 +4,79 @@ import jsonpickle
 from django.views.decorators.csrf import csrf_exempt
 from agent.dataloader import DataLoader
 from agent.communicator import Communicator
+from time import time
 
 communicators = []  # change into dict with cookie key to support several connections
+product_ids = ['B0000TIKK8', 'B0000TIIPK', 'B000AYW0M2', 'B000AYW0KO', 'B004J30ERI', 'B004VR9HP2', 'B00005UP2N',
+               'B0001HLTTI', 'B00063ULMI', 'B00791QYMQ']
+
+
+class Empty:
+    pass
 
 
 def index(request):
     return HttpResponse("OK")
 
 
+def products(request):
+    product_infos = []
+    for product_id in product_ids:
+        product_title = DataLoader.get_product_name(product_id)
+        star_rating = DataLoader.get_avg_star_rating(product_id)
+        image_url = ('https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN='
+                     + product_id + '&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SL250')
+        product_info = Empty()
+        product_info.id = product_id
+        product_info.name = product_title
+        product_info.starRating = star_rating
+        product_info.imageURL = image_url
+        product_infos.append(product_info)
+    return HttpResponse(jsonpickle.encode(product_infos, unpicklable=False), content_type="application/json")
+
+
 def product(request):
+    ts = time()
     product_id = request.GET.get('id', '')
-    if not communicators:
-        communicators.append(Communicator(product_id))
+    if communicators:
+        communicators.pop()
+    communicators.append(Communicator(product_id))
     communicator = communicators[0]
     init_message = communicator.get_init_message()
+    print('Request took {} seconds'.format(time() - ts))
     return HttpResponse(jsonpickle.encode(init_message, unpicklable=False), content_type="application/json")
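Note: the new `products` view serialises ad-hoc `Empty` objects with `jsonpickle.encode(..., unpicklable=False)`, which emits each object's `__dict__` as plain JSON rather than a pickle-reconstructable payload. In isolation:

    import jsonpickle

    class Empty:
        pass

    info = Empty()
    info.id = 'B0000TIKK8'
    info.starRating = 4.5
    print(jsonpickle.encode([info], unpicklable=False))
    # [{"id": "B0000TIKK8", "starRating": 4.5}]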