Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
16cb2dcc
Commit
16cb2dcc
authored
Apr 11, 2020
by
Joel Oksanen
Browse files
Changed review annotation to arrow key based
parent
52741466
Changes
2
Hide whitespace changes
Inline
Side-by-side
ADA/SA/bert_analyzer.py
View file @
16cb2dcc
...
...
@@ -28,8 +28,8 @@ class BertAnalyzer:
self
.
net
.
load_state_dict
(
torch
.
load
(
trained_model_path
))
self
.
net
.
eval
()
def
train
(
self
):
train_data
=
BertDataset
(
semeval_2014_train_path
)
def
train
(
self
,
dataset
):
train_data
=
BertDataset
(
dataset
)
train_loader
=
DataLoader
(
train_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
num_workers
=
4
,
collate_fn
=
generate_batch
)
...
...
@@ -65,8 +65,8 @@ class BertAnalyzer:
torch
.
save
(
net
.
state_dict
(),
trained_model_path
)
def
evaluate
(
self
):
test_data
=
BertDataset
(
semeval_2014_test_path
)
def
evaluate
(
self
,
dataset
):
test_data
=
BertDataset
(
dataset
)
test_loader
=
DataLoader
(
test_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
False
,
num_workers
=
4
,
collate_fn
=
generate_batch
)
...
...
@@ -93,4 +93,4 @@ class BertAnalyzer:
sentiment_analyzer
=
BertAnalyzer
()
sentiment_analyzer
.
load_saved
()
sentiment_analyzer
.
evaluate
()
\ No newline at end of file
sentiment_analyzer
.
evaluate
(
semeval_2014_test_path
)
\ No newline at end of file
ADA/review_annotation.py
View file @
16cb2dcc
...
...
@@ -8,6 +8,9 @@ import nltk.data
from
stanfordcorenlp
import
StanfordCoreNLP
from
nltk.tree
import
ParentedTree
as
Tree
import
re
import
readchar
from
sty
import
fg
,
bg
,
ef
,
rs
from
wcwidth
import
wcswidth
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location
=
'reviews_to_be_annotated.xml'
...
...
@@ -15,6 +18,7 @@ min_characters = 0
max_characters
=
200
n
=
500
sentiment_mappings
=
{
'+'
:
'positive'
,
'0'
:
'neutral'
,
'-'
:
'negative'
,
'c'
:
'conflict'
}
ann_bgs
=
{
'positive'
:
bg
.
green
,
'neutral'
:
bg
.
li_black
,
'negative'
:
bg
.
red
,
'conflict'
:
bg
.
yellow
}
annotated_reviews_location
=
'annotated_camera_reviews.xml'
included_labels
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
,
'DT'
,
'CD'
,
'FW'
,
'PRP$'
]
nouns
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
]
...
...
@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
tokenizer
=
TweetTokenizer
()
sent_tokenizer
=
nltk
.
data
.
load
(
'tokenizers/punkt/english.pickle'
)
class
bcolors
:
HEADER
=
'
\033
[95m'
OKBLUE
=
'
\033
[94m'
...
...
@@ -33,6 +38,7 @@ class bcolors:
BOLD
=
'
\033
[1m'
UNDERLINE
=
'
\033
[4m'
def
get_leaf_indices
(
tree
,
phrase_tree
):
phrase_tree_pos
=
phrase_tree
.
treeposition
()
start
=
0
...
...
@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree):
end
+=
1
return
(
start
,
end
)
# true if r1 contains r2
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Both arguments are indexed at positions 0, 1 and 2: positions 0 and 1
    are compared as lower/upper bounds (equality counts as contained) and
    position 2 is a string parsed with ``Tree.fromstring``.

    ``r2`` is contained only when its bounds lie within those of ``r1`` and
    its parsed tree compares equal to one of the subtrees of ``r1``'s
    parsed tree.
    """
    # Check the bounds first so the trees are only parsed when needed.
    if not (r1[0] <= r2[0] and r1[1] >= r2[1]):
        return False
    return Tree.fromstring(r2[2]) in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if ``n`` lies between ``r[0]`` and ``r[1]``, inclusive.

    Only the first two items of ``r`` are read, so ``r`` may carry extra
    trailing items.
    """
    return r[0] <= n <= r[1]
# true if rs cover r
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover the range ``r``.

    Every integer from ``r[0]`` to ``r[1]`` (inclusive) must fall inside at
    least one range of ``rs`` according to ``in_range``. If ``r[0] > r[1]``
    there is nothing to cover and the result is True.

    NOTE: the parameter name ``rs`` is kept for interface compatibility; it
    locally shadows the ``rs`` imported from ``sty`` at module level.
    """
    return all(
        any(in_range(other_r, idx) for other_r in rs)
        for idx in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as an opinion target.

    The tree's own label must be in ``included_labels``, and every subtree
    yielded by ``tree.subtrees()`` must either have a label in
    ``included_labels`` or be a ``'PRP'`` node whose first child,
    lower-cased, is ``'it'``.
    """
    # Reject early on the root label so the subtree walk is skipped.
    if tree.label() not in included_labels:
        return False
    return all(
        sub.label() in included_labels
        or (sub.label() == 'PRP' and sub[0].lower() == 'it')
        for sub in tree.subtrees()
    )
def
prepare_reviews
():
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
...
...
@@ -71,13 +82,13 @@ def prepare_reviews():
# try to filter out reviews for camera accessories
filter_words
=
[
'accessor'
,
'battery'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
filter_pat
=
''
for
word
in
filter_words
:
word_filter
=
'['
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
']'
+
word
[
1
:]
filter_pat
+=
word_filter
+
'|'
filter_pat
=
filter_pat
[:
-
1
]
reviews
=
reviews
[
~
reviews
[
'product_title'
].
str
.
contains
(
pat
=
filter_pat
,
regex
=
True
)]
reviews
=
reviews
[
~
reviews
[
'product_title'
].
str
.
contains
(
pat
=
filter_pat
,
regex
=
True
)]
# shuffle reviews
reviews
=
reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
...
...
@@ -121,7 +132,7 @@ def prepare_reviews():
parse_tree_node
.
text
=
parse_tree_str
tokenized_text_node
=
SubElement
(
sentence_node
,
'tokenized_text'
)
tokenized_text_node
.
text
=
' '
.
join
(
parse_tree
.
leaves
()).
replace
(
'``'
,
'""'
)
tokenized_text_node
.
text
=
' '
.
join
(
parse_tree
.
leaves
()).
replace
(
'``'
,
'""'
)
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
...
...
@@ -131,6 +142,7 @@ def prepare_reviews():
print
(
'Obtained and parsed'
,
len
(
reviews
),
'reviews'
)
def
annotate_reviews
():
row_character_count
=
100
reviews
=
parse
(
selected_reviews_location
)
...
...
@@ -143,69 +155,69 @@ def annotate_reviews():
for
review
in
not_annotated
:
for
sentence
in
review
.
find
(
'sentences'
):
tokens
=
sentence
.
find
(
'tokenized_text'
).
text
.
split
(
' '
)
os
.
system
(
'clear'
)
print
(
bcolors
.
OKBLUE
+
'{} reviews annotated'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
print
(
''
)
print
(
bcolors
.
OKBLUE
+
'annotation: [
\'
i
\'
|
\'
n,m
\'
] [
\'
+
\'
|
\'
0
\'
|
\'
-
\'
|
\'
c
\'
]'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'next:
\'
n
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'skip:
\'
s
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'quit:
\'
q
\'
'
+
bcolors
.
ENDC
)
print
(
''
)
product_title
=
review
.
find
(
'product_title'
).
text
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
print
(
''
)
index_row
=
''
text_row
=
''
for
t
in
range
(
len
(
tokens
)):
space
=
len
(
tokens
[
t
])
-
len
(
str
(
t
))
token_text
=
tokens
[
t
]
+
' '
index_text
=
str
(
t
)
+
' '
if
space
>
0
:
index_text
=
' '
*
math
.
floor
(
space
/
2
)
+
index_text
+
' '
*
math
.
ceil
(
space
/
2
)
elif
space
<
0
:
space
=
abs
(
space
)
token_text
=
' '
*
math
.
floor
(
space
/
2
)
+
token_text
+
' '
*
math
.
ceil
(
space
/
2
)
index_row
+=
index_text
text_row
+=
token_text
if
t
+
1
<
len
(
tokens
)
and
len
(
index_row
)
+
len
(
tokens
[
t
+
1
])
+
1
>
row_character_count
:
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
print
(
text_row
)
index_row
=
''
text_row
=
''
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
print
(
text_row
)
print
(
''
)
text
=
sentence
.
find
(
'text'
).
text
cursor_pos
=
0
start
=
None
end
=
None
annotations
=
[]
while
True
:
task
=
input
(
': '
)
if
len
(
task
.
split
(
' '
))
==
2
:
rng
=
None
sentiment
=
''
fst
=
task
.
split
(
' '
)[
0
]
if
fst
.
isdigit
():
rng
=
(
int
(
fst
),
int
(
fst
))
elif
(
','
in
fst
and
len
(
fst
.
split
(
','
))
==
2
and
fst
.
split
(
','
)[
0
].
isdigit
()
and
fst
.
split
(
','
)[
1
].
isdigit
()):
rng
=
(
int
(
fst
.
split
(
','
)[
0
]),
int
(
fst
.
split
(
','
)[
1
]))
snd
=
task
.
split
(
' '
)[
1
]
if
snd
in
sentiment_mappings
.
keys
():
sentiment
=
snd
if
rng
and
sentiment
:
annotations
.
append
((
rng
,
sentiment
))
os
.
system
(
'clear'
)
print
(
bcolors
.
OKBLUE
+
'{} reviews annotated'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
print
(
''
)
print
(
bcolors
.
OKBLUE
+
'next:
\'
n
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'skip:
\'
s
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'quit:
\'
q
\'
'
+
bcolors
.
ENDC
)
print
(
''
)
product_title
=
review
.
find
(
'product_title'
).
text
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
print
(
''
)
text_row
=
''
for
t
in
range
(
len
(
text
)):
char
=
text
[
t
]
if
t
==
cursor_pos
:
char
=
bg
.
blue
+
char
+
bg
.
rs
for
ann
in
annotations
:
if
t
in
range
(
ann
[
0
][
0
],
ann
[
0
][
1
]):
char
=
ann_bgs
[
ann
[
1
]]
+
char
+
bg
.
rs
text_row
+=
char
if
(
t
+
1
)
%
row_character_count
==
0
:
print
(
text_row
)
text_row
=
''
print
(
text_row
)
print
(
''
)
task
=
readchar
.
readkey
()
if
task
==
readchar
.
key
.
RIGHT
:
cursor_pos
=
min
(
cursor_pos
+
1
,
len
(
text
)
-
1
)
if
task
==
readchar
.
key
.
LEFT
:
cursor_pos
=
max
(
cursor_pos
-
1
,
0
)
if
task
==
readchar
.
key
.
DOWN
:
cursor_pos
=
min
(
cursor_pos
+
row_character_count
,
len
(
text
)
-
1
)
if
task
==
readchar
.
key
.
UP
:
cursor_pos
=
max
(
cursor_pos
-
row_character_count
,
0
)
if
task
==
readchar
.
key
.
SPACE
:
if
start
==
None
:
start
=
cursor_pos
elif
end
==
None
and
cursor_pos
>=
start
:
end
=
cursor_pos
+
1
rng
=
(
start
,
end
)
while
True
:
inp
=
input
(
'Sentiment for {},{}: '
.
format
(
start
,
end
-
1
))
if
inp
in
sentiment_mappings
.
keys
():
annotations
.
append
((
rng
,
sentiment_mappings
[
inp
]))
start
=
None
end
=
None
cursor_pos
=
min
(
cursor_pos
+
1
,
len
(
text
)
-
1
)
break
if
task
in
[
'n'
,
's'
,
'q'
]:
if
task
in
[
'n'
]:
...
...
@@ -216,7 +228,7 @@ def annotate_reviews():
range_node
=
SubElement
(
annotation_node
,
'range'
)
range_node
.
text
=
'{},{}'
.
format
(
annotation
[
0
][
0
],
annotation
[
0
][
1
])
sent_node
=
SubElement
(
annotation_node
,
'sentiment'
)
sent_node
.
text
=
sentiment_mappings
[
annotation
[
1
]
]
sent_node
.
text
=
annotation
[
1
]
break
if
task
==
'q'
:
break
...
...
@@ -232,6 +244,7 @@ def annotate_reviews():
with
open
(
selected_reviews_location
,
'w'
)
as
f
:
f
.
write
(
xmlstr
)
def
longest_common_subsequence
(
x
,
y
):
seq
=
[]
for
i
in
range
(
min
(
len
(
x
),
len
(
y
))):
...
...
@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y):
return
tuple
(
seq
)
def
labelled_tree_str
(
tree_str
,
start
,
end
):
tree
=
Tree
.
fromstring
(
tree_str
)
start_pos
=
tree
.
leaf_treeposition
(
start
)
...
...
@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end):
# find highest parent node common to start and end
if
start
==
end
:
parent_pos
=
start_pos
[:
len
(
start_pos
)
-
1
]
parent_pos
=
start_pos
[:
len
(
start_pos
)
-
1
]
else
:
parent_pos
=
longest_common_subsequence
(
start_pos
,
end_pos
)
parent_node
=
tree
[
parent_pos
]
...
...
@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end):
parent_pos
=
parent_pos
[:
len
(
parent_pos
)
-
1
]
# remove branches between start and end inclusive
child_index_rng
=
range
(
start_pos
[
len
(
parent_pos
)],
end_pos
[
len
(
parent_pos
)]
+
1
)
child_index_rng
=
range
(
start_pos
[
len
(
parent_pos
)],
end_pos
[
len
(
parent_pos
)]
+
1
)
child_positions
=
[
list
(
parent_pos
)
+
[
i
]
for
i
in
child_index_rng
]
children_to_remove
=
[
tree
[
tuple
(
child_pos
)]
for
child_pos
in
child_positions
]
for
child
in
children_to_remove
:
...
...
@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end):
return
str
(
tree
)
def
prepare_annotated_reviews
():
reviews
=
parse
(
selected_reviews_location
)
root
=
reviews
.
getroot
()
annotated
=
[
review
for
review
in
root
if
review
.
attrib
[
'annotated'
]
==
'true'
]
prepared_root
=
Element
(
'
data
'
)
prepared_root
=
Element
(
'
sentences
'
)
for
review
in
annotated
:
for
sentence
in
review
.
find
(
'sentences'
):
text
=
sentence
.
find
(
'text'
).
text
tree_str
=
sentence
.
find
(
'parse_tree'
).
text
annotations
=
sentence
.
find
(
'annotations'
)
if
sentence
.
find
(
'annotations'
)
else
[]
for
annotation
in
annotations
:
instance_node
=
SubElement
(
prepared_root
,
'instance'
)
text_node
=
SubElement
(
instance_node
,
'text'
)
text_node
.
text
=
text
op_node
=
SubElement
(
instance_node
,
'opinion'
)
op_node
.
text
=
annotation
.
find
(
'sentiment'
).
text
tree_node
=
SubElement
(
instance_node
,
'tree'
)
start
,
end
=
annotation
.
find
(
'range'
).
text
.
split
(
','
)
tree_node
.
text
=
labelled_tree_str
(
tree_str
,
int
(
start
),
int
(
end
))
sentence_node
=
SubElement
(
prepared_root
,
'sentence'
)
text_node
=
SubElement
(
sentence_node
,
'text'
)
text_node
.
text
=
text
if
sentence
.
find
(
'annotations'
):
aspect_terms_node
=
SubElement
(
sentence_node
,
'aspectTerms'
)
for
annotation
in
sentence
.
find
(
'annotations'
):
start
,
end
=
annotation
.
find
(
'range'
).
text
.
split
(
','
)
aspect_term_node
=
SubElement
(
aspect_terms_node
,
'aspectTerm'
)
aspect_term_node
.
set
(
'term'
,
text
[
start
:
end
])
aspect_term_node
.
set
(
'polarity'
,
annotation
.
find
(
'sentiment'
).
text
)
aspect_term_node
.
set
(
'from'
,
start
)
aspect_term_node
.
set
(
'to'
,
end
)
train_count
=
1000
train_root
=
Element
(
'data'
)
...
...
@@ -316,6 +335,7 @@ def prepare_annotated_reviews():
with
open
(
'amazon_camera_test.xml'
,
'w'
)
as
f
:
f
.
write
(
xmlstr
)
# prepare_reviews()
#
annotate_reviews()
prepare_annotated_reviews
()
annotate_reviews
()
#
prepare_annotated_reviews()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment