Joel Oksanen / individual_project / Commits

Commit 8416bbe7
Authored Jun 21, 2020 by Joel Oksanen

    Implementation complete

Parent: e38035ff

Changes: 13 changed files with 234 additions and 86 deletions (+234 -86)
ADA/server/agent/SA/bert_analyzer.py                       +16  -3
ADA/server/agent/SA/bert_dataset.py                         +4  -2
ADA/server/agent/SA/tdbertnet.py                            +4  -5
ADA/server/agent/argument.py                                +9  -0
ADA/server/agent/communicator.py                           +18  -4
ADA/server/agent/dataloader.py                              +1  -1
ADA/server/agent/framework.py                              +39 -17
ADA/server/agent/prep_metadata.py                          +32  -5
ADA/server/agent/review.py                                 +23  -6
ADA/server/agent/target_extraction/eval/eval.py            +15  -6
ADA/server/agent/target_extraction/target_extractor.py     +20 -19
ADA/server/ios_server/urls.py                               +3  -1
ADA/server/ios_server/views.py                             +50 -17
ADA/server/agent/SA/bert_analyzer.py

@@ -8,6 +8,8 @@ import time
 import numpy as np
 from sklearn import metrics
 
+device = torch.device('cuda')
+
 semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
 amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'

@@ -43,13 +45,20 @@ class BertAnalyzer:
                                collate_fn=generate_batch)
 
         self.net = TDBertNet(len(polarity_indices))
+        # initialise GPU
+        self.net.cuda()
         optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
 
         start = time.time()
 
         for epoch in range(MAX_EPOCHS):
             batch_loss = 0.0
-            for i, (texts, target_indices, labels) in enumerate(train_loader):
+            for i, batch in enumerate(train_loader):
+                # send batch to gpu
+                texts, target_indices, labels = tuple(i.to(device) for i in batch)
+
                 # zero param gradients
                 optimiser.zero_grad()

@@ -103,10 +112,14 @@ class BertAnalyzer:
         dataset = BertDataset.from_data(data)
         loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=generate_batch)
+        self.net.cuda()
         self.net.eval()
 
         predicted = []
         with torch.no_grad():
-            for texts, target_indices, _ in loader:
-                outputs, attentions = self.net(texts, target_indices)
+            for input_ids, attn_mask, target_indices, _ in loader:
+                input_ids, attn_mask, target_indices = tuple(i.to(device) for i in [input_ids, attn_mask, target_indices])
+                outputs = self.net(input_ids, attn_mask, target_indices)
                 batch_val, batch_pred = torch.max(outputs.data, 1)
                 predicted += [BertAnalyzer.get_polarity(val, pred) for val, pred in zip(batch_val, batch_pred)]
ADA/server/agent/SA/bert_dataset.py

@@ -12,9 +12,11 @@ MASK_TOKEN = '[MASK]'
 def generate_batch(batch):
-    texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
+    encoded = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
                                         max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
                                         return_tensors='pt')
+    input_ids = encoded['input_ids']
+    attn_mask = encoded['attention_mask']
 
     max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
     target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES

@@ -23,7 +25,7 @@ def generate_batch(batch):
     polarity_labels = torch.tensor([entry['polarity'] for entry in batch])
 
-    return texts, target_indices, polarity_labels
+    return input_ids, attn_mask, target_indices, polarity_labels
 
 
 def token_for_char(char_idx, text, tokens):
ADA/server/agent/SA/tdbertnet.py

@@ -11,18 +11,17 @@ class TDBertNet(nn.Module):
     def __init__(self, num_class):
         super(TDBertNet, self).__init__()
-        config = BertConfig.from_pretrained(TRAINED_WEIGHTS, output_attentions=True)
+        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
         self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
-        self.bert_base.config.output_attentions = True
         self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels
 
-    def forward(self, texts, target_indices):
+    def forward(self, input_ids, attn_mask, target_indices):
         # BERT
-        bert_output, _, attentions = self.bert_base(**texts)
+        bert_output, _ = self.bert_base(input_ids=input_ids, attention_mask=attn_mask)
         # max pooling at target locations
         target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
         pooled_output = torch.max(target_outputs, dim=1)[0]
         # fc layer with softmax activation
         x = F.softmax(self.fc(pooled_output), 1)
-        return x, attentions[-1]
+        return x
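Taken together, the three sentiment-analysis changes above move tokenisation output from a single encoded dict to explicit tensors. A minimal sketch of the new flow, assuming the repo's own generate_batch, TDBertNet and device; the batch contents and num_class here are hypothetical:

# Sketch only: batch entries follow the dataset's {'tokens', 'from', 'to', 'polarity'} shape.
input_ids, attn_mask, target_indices, labels = generate_batch(batch)

net = TDBertNet(num_class=3)  # e.g. positive / neutral / negative
net.cuda()

# forward now takes explicit tensors and returns only softmax scores,
# since attentions are no longer requested from BERT
outputs = net(input_ids.to(device), attn_mask.to(device), target_indices.to(device))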
ADA/server/agent/argument.py (new file, mode 100644)

+class Argument:
+    def __init__(self, text, polarity, supporters, attackers, phrase, size):
+        self.text = text
+        self.polarity = 'POS' if polarity else 'NEG'
+        self.supporters = supporters
+        self.attackers = attackers
+        self.phrase = phrase.text if phrase else '-'
+        self.size = size
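Since Argument is a plain container, a node can be built directly. A two-node example using only the constructor above (the node names are hypothetical):

bowl = Argument('bowl', True, [], [], None, 100)           # leaf; no phrase -> '-'
mixer = Argument('mixer', True, [bowl], [], None, 120)     # root supported by 'bowl'
print(mixer.polarity, [a.text for a in mixer.supporters])  # POS ['bowl']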
ADA/server/agent/communicator.py

@@ -21,7 +21,7 @@ class Communicator:
     def get_init_message(self):
         prod_node = self.product.root
         prod = self.product.argument_for_node(prod_node)
-        text = ADAText('What would you like to know about the *?', [prod.name])
+        text = ADAText('What would you like to know about this *?', [prod.name])
         queries = self.get_queries(prod_node)
         args = [prod.with_queries(queries)]
         return ADAMessage(text, args)

@@ -75,9 +75,20 @@ class Communicator:
         if query_id == 4 or query_id == 5:
             phrase = (self.framework.best_supporting_phrase(q_arg_node) if query_id == 4
                       else self.framework.best_attacking_phrase(q_arg_node))
-            while phrase[-1] == '.':
-                phrase = phrase[:-1]
-            text = ADAText('\"...*...\"', [phrase], style='QUOT')
+            template = ''
+            args = []
+            i = 0
+            for form, start, end in phrase.get_arg_mentions(q_arg_node):
+                template += phrase.text[i:start] + '*'
+                i = end
+                args.append(form)
+            template += phrase.text[i:len(phrase.text)]
+            while template[-1] == '.':
+                template = template[:-1]
+            text = ADAText('\"...{}...\"'.format(template), args, style='QUOT')
             args = [q_arg_node]
             args = [self.product.argument_for_node(arg).with_queries(self.get_queries(arg)) for arg in args]

@@ -106,3 +117,6 @@ class Communicator:
     def was_were(self, arg_n):
         return 'was' if self.product.singularities[arg_n] else 'were'
+
+    def get_argument_graph(self):
+        return self.framework.get_argument_graph()
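The new template loop replaces each argument mention in the quoted phrase with a '*' placeholder instead of quoting the raw phrase. A worked example with hypothetical values:

# Hypothetical phrase text and mention offsets (form, char start, char end):
text = 'The motor is very quiet.'
mentions = [('motor', 4, 9)]

template, args, i = '', [], 0
for form, start, end in mentions:
    template += text[i:start] + '*'   # 'The *'
    i = end
    args.append(form)
template += text[i:len(text)]         # 'The * is very quiet.'
while template[-1] == '.':
    template = template[:-1]          # trailing full stops trimmed

print('\"...{}...\"'.format(template), args)  # "...The * is very quiet..." ['motor']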
ADA/server/agent/dataloader.py

@@ -2,7 +2,7 @@ import pandas as pd
 
 class DataLoader:
-    data_location = 'agent/amazon_data/top_5_mixer_reviews_subset.tsv'
+    data_location = 'agent/amazon_data/reviews.tsv'
     reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
 
     @staticmethod
ADA/server/agent/framework.py

-from anytree import PostOrderIter
+from anytree import PostOrderIter, PreOrderIter
 from functools import reduce
 from agent.SA.bert_analyzer import BertAnalyzer
 from agent.target_extraction.product import Product
 from agent.review import Review
 from agent.dataloader import DataLoader
+from agent.argument import Argument
 import pickle
 import re
+from time import time
 
 
 class Framework:

@@ -19,9 +21,14 @@ class Framework:
         self.arguments = self.product.argument_nodes
         self.features = self.product.feature_nodes
 
+        ts = time()
+
         # get reviews
         review_csv = DataLoader.get_reviews(product_id)
-        reviews = [Review(row, self.product) for _, row in review_csv.iterrows()]
+        reviews = [Review(row, self.product) for _, row in review_csv.head(1000).iterrows()]
+
+        t_feature = time()
+        print('Feature detection took {} seconds'.format(t_feature - ts))
 
         # extract augmented votes
         self.extract_votes(reviews)

@@ -29,15 +36,25 @@ class Framework:
         if len(voting_reviews) / len(reviews) < 0.33:
             print('warning: only a small fraction of reviews generated votes')
 
+        t_sa = time()
+        print('Sentiment analysis took {} seconds'.format(t_sa - t_feature))
+
         # get aggregates
         ra, self.vote_sum, self.vote_phrases = self.get_aggregates(reviews)
 
+        t_ra = time()
+        print('Review aggregation took {} seconds'.format(t_ra - t_sa))
+
         # get qbaf from ra
         self.qbaf, self.argument_polarities = self.get_qbaf(ra, len(reviews))
 
         # apply gradual semantics
         self.strengths = self.get_strengths(self.qbaf)
 
+        te = time()
+        print('QBAF construction took {} seconds'.format(te - t_ra))
+        print('Process took {} seconds'.format(te - ts))
+
         # save
         self.save()

@@ -78,7 +95,7 @@ class Framework:
         for review in reviews:
             for phrase in review.phrases:
                 for arg, sentiment in phrase.get_votes().items():
-                    vote_phrases[arg].append({'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)})
+                    vote_phrases[arg].append(phrase)  # {'phrase': phrase.text, 'sentiment': sentiment, 'n_args': len(phrase.args)}
             for arg, sentiment in review.get_votes().items():
                 ra.append({'review_id': review.id, 'argument': arg, 'vote': sentiment})
                 vote_sum[arg] += sentiment

@@ -187,31 +204,36 @@ class Framework:
         return att is not None and self.strengths[att] > 0
 
     def best_supporting_phrase(self, argument):
-        phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)
-                   if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
+        phrases = [phrase for phrase in self.supporting_phrases(argument)
+                   if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
         if len(phrases) == 0:
             return None
-        top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k], reverse=True))[:5]
-        print(top_5)
-        return max(top_5, key=lambda p: len(p))
+        top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument), reverse=True))[:5]
+        return max(top_5, key=lambda p: len(p.text))
 
     def best_attacking_phrase(self, argument):
-        phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)
-                   if vp['n_args'] == 1 and Framework.is_well_formatted(vp['phrase'])}
+        phrases = [phrase for phrase in self.attacking_phrases(argument)
+                   if phrase.n_args() == 1 and Framework.is_well_formatted(phrase.text)]
         if len(phrases) == 0:
             return None
-        top_5 = list(sorted(phrases.keys(), key=lambda k: phrases[k]))[:5]
-        print(top_5)
-        return max(top_5, key=lambda p: len(p))
+        top_5 = list(sorted(phrases, key=lambda p: p.get_vote(argument)))[:5]
+        return max(top_5, key=lambda p: len(p.text))
 
     @staticmethod
     def is_well_formatted(phrase):
-        if not re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase):
-            print(phrase)
         return re.match('^[-a-zA-Z0-9();,./!?\'" ]*$', phrase)
 
     def supporting_phrases(self, argument):
-        return list(filter(lambda vp: vp['sentiment'] > 0, self.vote_phrases[argument]))
+        return list(filter(lambda phrase: phrase.get_vote(argument) > 0, self.vote_phrases[argument]))
 
     def attacking_phrases(self, argument):
-        return list(filter(lambda vp: vp['sentiment'] < 0, self.vote_phrases[argument]))
+        return list(filter(lambda phrase: phrase.get_vote(argument) < 0, self.vote_phrases[argument]))
+
+    def get_argument_graph(self):
+        return self.create_arg(self.product_node, 120)
+
+    def create_arg(self, arg_node, size):
+        supporters = [self.create_arg(supp_node, size - 20) for supp_node in self.qbaf['supporters'][arg_node]]
+        attackers = [self.create_arg(att_node, size - 20) for att_node in self.qbaf['attackers'][arg_node]]
+        phrase = self.best_supporting_phrase(arg_node) if self.argument_polarities[arg_node] else self.best_attacking_phrase(arg_node)
+        return Argument(arg_node.name, self.argument_polarities[arg_node], supporters, attackers, phrase, size)
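get_argument_graph recursively wraps each QBAF node in an Argument, shrinking the display size by 20 per level. One way the result might be serialised for the client, an assumption based on the jsonpickle usage already present in views.py rather than code shown in this diff:

import jsonpickle

# 'framework' stands for a constructed Framework instance (hypothetical name);
# the graph is a nested Argument with supporters/attackers lists
graph = framework.get_argument_graph()
json_body = jsonpickle.encode(graph, unpicklable=False)  # plain nested JSON objects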
ADA/server/agent/prep_metadata.py

 import pandas as pd
 
 pd.set_option('display.max_colwidth', None)
+all_reviews_file = 'amazon_data/reviews.tsv'
 
 
 def get_reviews(category, meta_file, review_file):

@@ -21,7 +22,7 @@ def save_reviews(category, meta_file, review_file, output_file):
     reviews.to_csv(output_file, sep='\t', index=False)
 
 
-def save_top_reviewed_products(n, output_file, product_title, category=None, review_file=None, meta_file=None,
+def save_top_reviewed_products(n, category=None, review_file=None, meta_file=None,
                                product_file=None):
     if product_file:
         reviews = pd.read_csv(product_file, sep='\t')

@@ -33,9 +34,35 @@ def save_top_reviewed_products(n, output_file, product_title, category=None, rev
                                        'reviewText': 'review_body'})
     reviews = reviews[reviews['review_body'].apply(lambda b: not pd.isna(b) and len(b) > 0)]
     reviews = reviews[reviews['star_rating'].apply(lambda r: type(r) is int or r.isdigit())]
-    reviews['product_title'] = product_title
-    reviews.to_csv(output_file, sep='\t', index=False)
+    titles = {asin: input('Product title for {}: '.format(asin)) for asin in reviews['product_id'].unique()}
+    reviews['product_title'] = reviews['product_id'].apply(lambda asin: titles[asin])
+    all_reviews = pd.read_csv(all_reviews_file, sep='\t')
+    all_reviews = pd.concat([all_reviews, reviews])
+    all_reviews.to_csv(all_reviews_file, sep='\t', index=False)
+
+
+def get_top_products_by_brand(n, brand, meta_file, review_file):
+    metadata_iter = pd.read_json(meta_file, lines=True, chunksize=1000)
+    metadata = pd.concat([metadata[metadata['brand'].apply(lambda b: type(b) is str and b == brand)]
+                          for metadata in metadata_iter])
+    print(len(metadata.index))
+    print(metadata.head())
+    print(metadata.columns)
-
-save_top_reviewed_products(5, 'amazon_data/top_5_mixer_reviews.tsv',
-                           'KitchenAid KSM150PSGR Artisan Series 5-Qt. Stand Mixer with Pouring Shield - Imperial Grey',
-                           product_file='target_extraction/data/verified_stand_mixer_reviews.tsv')
+    review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
+    reviews = pd.concat([reviews[reviews['asin'].isin(metadata['asin'])] for reviews in review_iter])
+    print(len(reviews.index))
+    top_reviewed = reviews.groupby(['asin'], sort=False).size().sort_values(ascending=False).head(n)
+    return top_reviewed
+
+
+def get_product_reviews_for_asin(asin, review_file, output_file):
+    review_iter = pd.read_json(review_file, lines=True, chunksize=1000)
+    reviews = pd.concat([reviews[reviews['asin'].apply(lambda p_asin: p_asin == asin)] for reviews in review_iter])
+    print(len(reviews.index))
+    reviews.to_csv(output_file, sep='\t', index=False)
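With product_title and output_file dropped from the signature, titles are now collected interactively per product_id and the rows are appended to the shared reviews.tsv. A hypothetical invocation, with made-up paths:

# Prompts once per distinct product_id for a display title, then appends
# the cleaned rows to all_reviews_file ('amazon_data/reviews.tsv'):
save_top_reviewed_products(5,
                           review_file='amazon_data/Appliances.json',
                           meta_file='amazon_data/meta_Appliances.json')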
ADA/server/agent/review.py

 import re
-from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
 from agent.SA.bert_dataset import MAX_SEQ_LEN
 from anytree import PostOrderIter
 from nltk.stem import WordNetLemmatizer
 
 wnl = WordNetLemmatizer()
+tokenizer = TreebankWordTokenizer()
 
 
 class Review:

@@ -61,7 +62,8 @@ class Phrase:
     def __init__(self, text, product):
         self.product = product
         self.text = text
-        self.tokens = [word.lower() for word in word_tokenize(text)]
+        self.spans = list(tokenizer.span_tokenize(text))
+        self.tokens = [text[start:end] for start, end in self.spans]
         self.args = self.get_args()
         self.votes = {}

@@ -72,7 +74,7 @@ class Phrase:
         while len(arguments) > 0:
             arg = arguments.pop(0)
             for term in self.product.glossary[arg]:
-                matches = [Arg(arg, start, end)
+                matches = [Arg(arg, ' '.join(term), start, end)
                            for start, end in Phrase.matching_subsequences(term, self.tokens)]
                 if matches:
                     argument_matches += matches

@@ -89,8 +91,8 @@ class Phrase:
                 pass
             self.remove_ancestors(node.parent, l)
 
-    def add_arg(self, arg):
-        self.args.append(arg)
+    # def add_arg(self, arg):
+    #     self.args.append(arg)
 
     def num_args(self):
         return len(self.args)

@@ -102,6 +104,20 @@ class Phrase:
                 self.votes[arg.node] = arg.sentiment
         return self.votes
 
+    def get_vote(self, node):
+        return self.votes[node]
+
+    def get_arg_mentions(self, node):
+        mentions = []
+        for arg in self.args:
+            if arg.node == node:
+                start, end = self.spans[arg.start][0], self.spans[arg.end - 1][1]
+                mentions.append((arg.form, start, end))
+        return mentions
+
+    def n_args(self):
+        return len(self.args)
+
     @staticmethod
     def matching_subsequences(l_sub, l):
         sub_idxs = []

@@ -114,8 +130,9 @@ class Phrase:
 class Arg:
 
-    def __init__(self, node, start, end):
+    def __init__(self, node, form, start, end):
         self.node = node
+        self.form = form
         self.start = start
         self.end = end
         self.sentiment = None
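The switch from word_tokenize to span_tokenize is what makes get_arg_mentions possible: character offsets let token-level matches be mapped back into the original string. A quick standalone illustration (example text is not from the repo):

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
text = 'Great motor!'
spans = list(tokenizer.span_tokenize(text))  # [(0, 5), (6, 11), (11, 12)]
tokens = [text[s:e] for s, e in spans]       # ['Great', 'motor', '!']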
ADA/server/agent/target_extraction/eval/eval.py

@@ -14,6 +14,14 @@ def get_votes(df):
     return df.apply(lambda row: row['true'] > row['false'], axis=1)
 
 
+def get_votes_for_product(product):
+    for i in range(n_raters):
+        votes = get_votes(get_df(product, i))
+        print(methods[i])
+        print(votes)
+        print('')
+
+
 def get_accuracy(df):
     votes = get_votes(df)
     return sum(1 if vote else 0 for vote in votes) / len(votes) if len(votes) > 0 else None

@@ -45,10 +53,11 @@ def get_kappa():
     return (p - pe) / (1 - pe)
 
 
-# for i in range(3):
-#     print(i)
-#     for prod in products:
-#         print('  ', prod, len(get_df(prod, i)))
-#     print('  ', sum(len(get_df(prod, i)) for prod in products))
+def get_agreement():
+    df = pd.concat(get_df(prod, i) for prod in products for i in range(3)).reset_index(drop=True)
+    agreed = df[df.apply(lambda row: row['true'] == n_raters or row['false'] == n_raters, axis=1)]
+    return len(agreed) / len(df)
 
-print_accuracies()
\ No newline at end of file
+
+for p in products:
+    print(len(get_df(p, 0)))
\ No newline at end of file
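get_kappa applies the standard chance-corrected agreement formula, kappa = (p - pe) / (1 - pe), where p is the observed agreement and pe the agreement expected by chance (how p and pe are built is defined upstream in the file and not shown in this diff). A tiny numeric check with hypothetical values:

p, pe = 0.85, 0.50        # observed vs. chance agreement (made-up numbers)
kappa = (p - pe) / (1 - pe)
print(round(kappa, 2))    # 0.7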
ADA/server/agent/target_extraction/target_extractor.py

@@ -6,7 +6,7 @@ from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 import string
 from gensim.models.phrases import Phrases, Phraser
-from anytree import Node, RenderTree
+from anytree import Node, RenderTree, PreOrderIter
 import numpy as np
 import re
 from gensim.models import Word2Vec

@@ -18,6 +18,7 @@ from agent.target_extraction.BERT.relation_extractor.pair_rel_dataset import Pai
 from agent.target_extraction.BERT.relation_extractor.bert_rel_extractor import BertRelExtractor
 from pathos.multiprocessing import ProcessingPool as Pool
 import itertools
+from time import time
 
 np.set_printoptions(precision=4, threshold=np.inf, suppress=True)
 stop_words = stopwords.words('english')

@@ -160,9 +161,11 @@ class TargetExtractor:
         self.product = product
         self.file_path = file_path
 
+        ts = time()
+
         print('tokenizing phrases...')
         # tokenize and normalize phrases
-        texts = TargetExtractor.obtain_texts(file_path, text_column, n=200000)
+        texts = TargetExtractor.obtain_texts(file_path, text_column, n=500000)
         self.sentences = list(itertools.chain.from_iterable(pool.map(sent_tokenize, texts)))
         self.sentences = pool.map(lambda s: s.replace('_', ' ').lower(), self.sentences)
         self.phrases = pool.map(word_tokenize, self.sentences)

@@ -179,19 +182,21 @@ class TargetExtractor:
         self.counter = self.count_nouns()
         self.total_count = sum(self.counter.values())
         self.save()
 
+        t_noun = time()
+        print('Noun extraction took {} seconds'.format(t_noun - ts))
+
+        print('mining aspects...')
+        # mine aspects
+        self.aspects, self.counts = self.get_aspects(self.counter)
+
+        t_feature = time()
+        print('Feature extraction took {} seconds'.format(t_feature - t_noun))
+
         print('training word2vec model...')
         # train word2vec model
         self.wv = self.get_word2vec_model(size=TargetExtractor.WV_SIZE, window=TargetExtractor.WV_WINDOW,
                                           min_count=TargetExtractor.MIN_TERM_COUNT)
         self.save()
 
-        print('mining aspects...')
-        # mine aspects
-        self.aspects, self.counts = self.get_aspects(self.counter)
-
         print('extracting synonyms...')
         # obtain synonyms
         self.syn_dict = self.get_syn_dict()

@@ -201,18 +206,19 @@ class TargetExtractor:
         self.counts = {aspect: sum(self.counts[syn] for syn in self.syn_dict[aspect]) for aspect in self.aspects}
         self.aspects = sorted(self.aspects, key=self.counts.get, reverse=True)
-        print(self.syn_dict)
         self.save()
 
+        t_syn = time()
+        print('Synonym extraction took {} seconds'.format(t_syn - t_feature))
+
         print('extracting relatedness matrix...')
         self.relatedness_matrix = self.get_bert_relations()
         self.save()
 
         print('extracting aspect tree...')
         self.tree = self.get_product_tree3()
 
+        te = time()
+        print('Ontology extraction took {} seconds'.format(te - t_syn))
+        print('Full process took {} seconds'.format(te - ts))
+
         print('saving...')
         self.save()

@@ -508,8 +514,3 @@ class Synset:
             if w in group:
                 return group
         return None
-
-
-# for p in ['mixer', 'game', 'necklace', 'watch', 'tv']:
-#     extr = TargetExtractor.load_saved(p)
-#     extr.save_product_representation()
ADA/server/ios_server/urls.py

@@ -4,6 +4,8 @@ from . import views
 
 urlpatterns = [
     path('', views.index, name='index'),
     path('products/', views.products, name='products'),
     path('product/', views.product, name='product'),
-    path('message/', views.message, name='message')
+    path('message/', views.message, name='message'),
+    path('arguments/', views.arguments, name='arguments')
 ]
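The new arguments route pairs with Framework.get_argument_graph above. A hypothetical client call; the arguments view body is not part of the visible diff, so the dev-server URL and the 'id' query parameter (mirroring the product view) are assumptions:

import requests

resp = requests.get('http://localhost:8000/arguments/', params={'id': 'B0000TIKK8'})
graph = resp.json()  # nested argument graph encoded server-side with jsonpickle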
ADA/server/ios_server/views.py

@@ -4,46 +4,79 @@ import jsonpickle
 from django.views.decorators.csrf import csrf_exempt
 from agent.dataloader import DataLoader
 from agent.communicator import Communicator
+from time import time
 
 communicators = []  # change into dict with cookie key to support several connections
+product_ids = ['B0000TIKK8', 'B0000TIIPK', 'B000AYW0M2', 'B000AYW0KO', 'B004J30ERI', 'B004VR9HP2', 'B00005UP2N',
+               'B0001HLTTI', 'B00063ULMI', 'B00791QYMQ']
+
+
+class Empty:
+    pass
 
 
 def index(request):
     return HttpResponse("OK")
 
 
+def products(request):
+    product_infos = []
+    for product_id in product_ids:
+        product_title = DataLoader.get_product_name(product_id)
+        star_rating = DataLoader.get_avg_star_rating(product_id)
+        image_url = 'https://ws-na.amazon-adsystem.com/widgets/q?_encoding=UTF8&MarketPlace=US&ASIN=' + product_id + '&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=SL250'
+        product_info = Empty()
+        product_info.id = product_id
+        product_info.name = product_title
+        product_info.starRating = star_rating
+        product_info.imageURL = image_url
+        product_infos.append(product_info)
+    return HttpResponse(jsonpickle.encode(product_infos, unpicklable=False), content_type="application/json")
+
+
 def product(request):
+    ts = time()
     product_id = request.GET.get('id', '')
-    if not communicators:
-        communicators.append(Communicator(product_id))
+    if communicators:
+        communicators.pop()
+    communicators.append(Communicator(product_id))
     communicator = communicators[0]
     init_message = communicator.get_init_message()
+    print('Request took {} seconds'.format(time() - ts))
     return HttpResponse(jsonpickle.encode(init_message, unpicklable=False), content_type="application/json")
 
 
 @csrf_exempt
 def message(request):
...