Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
6c6eb5e2
Commit
6c6eb5e2
authored
Apr 06, 2020
by
Joel Oksanen
Browse files
Implemented simpler annotation method.
parent
3299e4af
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
ADA/review_annotation.py
View file @
6c6eb5e2
...
@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
...
@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
import
re
import
re
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
data_location
=
'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location
=
'reviews_to_be_annotated
2
.xml'
selected_reviews_location
=
'reviews_to_be_annotated.xml'
min_characters
=
0
min_characters
=
0
max_characters
=
200
max_characters
=
200
n
=
500
n
=
500
sentiment_mappings
=
{
'+'
:
'positive'
,
'0'
:
'neutral'
,
'-'
:
'negative'
}
sentiment_mappings
=
{
'+'
:
'positive'
,
'0'
:
'neutral'
,
'-'
:
'negative'
}
annotated_reviews_location
=
'annotated_camera_reviews2.xml'
annotated_reviews_location
=
'annotated_camera_reviews2.xml'
included_labels
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
,
'DT'
,
'CD'
,
'FW'
,
'PRP$'
]
nouns
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
]
tokenizer
=
TweetTokenizer
()
tokenizer
=
TweetTokenizer
()
sent_tokenizer
=
nltk
.
data
.
load
(
'tokenizers/punkt/english.pickle'
)
sent_tokenizer
=
nltk
.
data
.
load
(
'tokenizers/punkt/english.pickle'
)
nlp
=
StanfordCoreNLP
(
r
'/Users/joeloksanen/stanford-corenlp-full-2018-10-05'
)
class
bcolors
:
class
bcolors
:
HEADER
=
'
\033
[95m'
HEADER
=
'
\033
[95m'
...
@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
...
@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Each range is a tuple ``(start, end)`` or ``(start, end, tree_str)``,
    where ``start``/``end`` are inclusive leaf indices and ``tree_str`` is
    the string form of the parse subtree the range was taken from.

    ``r1`` contains ``r2`` when its index span encloses the span of ``r2``
    and, if both ranges carry a tree string, the tree of ``r2`` is equal to
    one of the subtrees of the tree of ``r1``.  Ranges without a tree
    string are compared on their indices only.
    """
    # Cheap index test first: it rejects most pairs without any parsing.
    if not (r1[0] <= r2[0] and r1[1] >= r2[1]):
        return False

    # Plain (start, end) ranges have no tree to compare.
    if len(r1) < 3 or len(r2) < 3:
        return True

    # Two different phrases can span the same leaves, so also require that
    # r2's tree actually occurs inside r1's tree.
    inner = Tree.fromstring(r2[2])
    return inner in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if index ``n`` lies inside range ``r``.

    ``r`` is a sequence whose first two items are the inclusive start and
    end of the range; any further items are ignored.
    """
    return r[0] <= n <= r[1]
...
@@ -52,25 +53,24 @@ def in_range(r, n):
...
@@ -52,25 +53,24 @@ def in_range(r, n):
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover range ``r``.

    ``r`` and every element of ``rs`` are sequences whose first two items
    are inclusive start and end indices.  ``r`` is covered when every index
    from ``r[0]`` to ``r[1]`` falls inside at least one range of ``rs``.
    An empty ``r`` (start greater than end) is trivially covered.
    """
    for n in range(r[0], r[1] + 1):
        # An index is covered if ANY range holds it; testing only the first
        # range in rs would wrongly report gaps.
        if not any(other_r[0] <= n <= other_r[1] for other_r in rs):
            return False
    return True
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as a candidate opinion target.

    The tree qualifies when its own label is in the module-level
    ``included_labels`` and every subtree yielded by ``tree.subtrees()``
    either has a label in ``included_labels`` or is a ``PRP`` node whose
    first child, lower-cased, is ``'it'``.
    """
    def allowed(sub):
        # The pronoun "it" is the only PRP accepted inside a target phrase.
        if sub.label() in included_labels:
            return True
        return sub.label() == 'PRP' and sub[0].lower() == 'it'

    if tree.label() not in included_labels:
        return False
    return all(allowed(sub) for sub in tree.subtrees())
def
prepare_reviews
():
def
prepare_reviews
():
nlp
=
StanfordCoreNLP
(
r
'/Users/joeloksanen/stanford-corenlp-full-2018-10-05'
)
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
# drop reviews with empty review body
# drop reviews with empty review body
reviews
=
reviews
[
~
reviews
[
'review_body'
].
isnull
()]
reviews
=
reviews
[
~
reviews
[
'review_body'
].
isnull
()]
# # select reviews with specified review_body length
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# # filter out reviews with more than one sentence
# reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
# try to filter out reviews for camera accessories
# try to filter out reviews for camera accessories
filter_words
=
[
'accessor'
,
'battery'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
filter_words
=
[
'accessor'
,
'battery'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
...
@@ -93,6 +93,7 @@ def prepare_reviews():
...
@@ -93,6 +93,7 @@ def prepare_reviews():
for
index
,
review
in
reviews
.
iterrows
():
for
index
,
review
in
reviews
.
iterrows
():
review_node
=
SubElement
(
root
,
'review'
)
review_node
=
SubElement
(
root
,
'review'
)
review_node
.
set
(
'annotated'
,
'false'
)
id_node
=
SubElement
(
review_node
,
'review_id'
)
id_node
=
SubElement
(
review_node
,
'review_id'
)
id_node
.
text
=
review
[
'review_id'
]
id_node
.
text
=
review
[
'review_id'
]
title_node
=
SubElement
(
review_node
,
'product_title'
)
title_node
=
SubElement
(
review_node
,
'product_title'
)
...
@@ -100,6 +101,7 @@ def prepare_reviews():
...
@@ -100,6 +101,7 @@ def prepare_reviews():
text_node
=
SubElement
(
review_node
,
'review_body'
)
text_node
=
SubElement
(
review_node
,
'review_body'
)
# reformat text
# reformat text
text
=
review
[
'review_body'
]
text
=
review
[
'review_body'
]
text
=
text
.
replace
(
'<br />'
,
'
\n
'
)
text
=
re
.
sub
(
'[.][.]+'
,
'...'
,
text
)
text
=
re
.
sub
(
'[.][.]+'
,
'...'
,
text
)
text_node
.
text
=
text
text_node
.
text
=
text
...
@@ -115,14 +117,17 @@ def prepare_reviews():
...
@@ -115,14 +117,17 @@ def prepare_reviews():
parse_tree_str
=
nlp
.
parse
(
sentence
)
parse_tree_str
=
nlp
.
parse
(
sentence
)
parse_tree
=
Tree
.
fromstring
(
parse_tree_str
)
parse_tree
=
Tree
.
fromstring
(
parse_tree_str
)
parse_tree_node
=
SubElement
(
sentence_node
,
'parse_tree'
)
parse_tree_node
.
text
=
parse_tree_str
tokenized_text_node
=
SubElement
(
sentence_node
,
'tokenized_text'
)
tokenized_text_node
=
SubElement
(
sentence_node
,
'tokenized_text'
)
tokenized_text_node
.
text
=
' '
.
join
(
parse_tree
.
leaves
())
tokenized_text_node
.
text
=
' '
.
join
(
parse_tree
.
leaves
())
ranges
=
[]
ranges
=
[]
for
subtree
in
parse_tree
.
subtrees
():
for
subtree
in
parse_tree
.
subtrees
():
if
subtree
.
label
()
==
'NP'
:
if
is_opinion_target
(
subtree
)
:
start
,
end
=
get_leaf_indices
(
parse_tree
,
subtree
)
start
,
end
=
get_leaf_indices
(
parse_tree
,
subtree
)
ranges
.
append
((
start
,
end
))
ranges
.
append
((
start
,
end
,
str
(
subtree
)
))
ranges
.
sort
(
key
=
(
lambda
t
:
t
[
1
]
-
t
[
0
]),
reverse
=
True
)
ranges
.
sort
(
key
=
(
lambda
t
:
t
[
1
]
-
t
[
0
]),
reverse
=
True
)
ranges_to_delete
=
[]
ranges_to_delete
=
[]
...
@@ -133,135 +138,105 @@ def prepare_reviews():
...
@@ -133,135 +138,105 @@ def prepare_reviews():
elif
subranges
:
elif
subranges
:
ranges_to_delete
.
append
(
range
)
ranges_to_delete
.
append
(
range
)
unique_ranges
=
list
(
set
(
ranges
)
-
set
(
ranges_to_delete
))
unique_ranges
=
list
(
filter
(
lambda
r
:
Tree
.
fromstring
(
r
[
2
]).
label
()
in
nouns
,
set
(
ranges
)
-
set
(
ranges_to_delete
)))
unique_ranges
.
sort
(
key
=
(
lambda
t
:
t
[
0
]))
unique_ranges
.
sort
(
key
=
(
lambda
t
:
t
[
0
]))
phrase_ranges_node
=
SubElement
(
sentence_node
,
'phrase_ranges'
)
phrase_ranges_node
=
SubElement
(
sentence_node
,
'phrase_ranges'
)
for
range
in
unique_ranges
:
for
range
in
unique_ranges
:
phrase_range_node
=
SubElement
(
phrase_ranges_node
,
'phrase_range'
)
phrase_range_node
=
SubElement
(
phrase_ranges_node
,
'phrase_range'
)
phrase_range_node
.
text
=
'{},{}'
.
format
(
range
[
0
],
range
[
1
])
phrase_range_node
.
text
=
'{},{}'
.
format
(
range
[
0
],
range
[
1
])
# save selected reviews
# save tree to file
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
xmlstr
=
os
.
linesep
.
join
([
s
for
s
in
xmlstr
.
splitlines
()
if
s
.
strip
()])
xmlstr
=
os
.
linesep
.
join
([
s
for
s
in
xmlstr
.
splitlines
()
if
s
.
strip
()])
with
open
(
selected_reviews_location
,
'w'
)
as
f
:
with
open
(
selected_reviews_location
,
'w'
)
as
f
:
f
.
write
(
xmlstr
)
f
.
write
(
xmlstr
)
# reviews.to_csv(selected_reviews_location, sep='\t', index=False)
def
annotate_reviews
():
def
annotate_reviews
():
row_character_count
=
100
row_character_count
=
100
reviews
=
parse
(
selected_reviews_location
)
# pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
reviews
=
parse
(
selected_reviews_location
)
# pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
root
=
reviews
.
getroot
()
annotated_reviews
=
parse
(
annotated_reviews_location
)
if
os
.
path
.
isfile
(
annotated_reviews_location
)
else
None
root
=
annotated_reviews
.
getroot
()
if
annotated_reviews
else
Element
(
'data'
)
n_annotated
=
len
(
root
)
# filter out reviews that have been annotated already
# filter out reviews that have been annotated already
annotated_review_ids
=
[
id_node
.
text
for
id_node
in
root
.
iter
(
'review_id'
)]
not_annotated
=
[
review
for
review
in
root
if
review
.
attrib
[
'annotated'
]
==
'false'
]
n_annotated
=
len
(
root
)
-
len
(
not_annotated
)
for
review
in
reviews
.
findall
(
'node2'
):
if
review
.
find
(
'review_id'
).
text
in
annotated_review_ids
:
reviews
.
remove
(
review
)
os
.
system
(
'clear'
)
os
.
system
(
'clear'
)
for
review
in
reviews
:
for
review
in
not_annotated
:
print
(
bcolors
.
OKBLUE
+
'{} reviews annotated'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
for
sentence
in
review
.
find
(
'sentences'
):
print
(
''
)
tokens
=
sentence
.
find
(
'tokenized_text'
).
text
.
split
(
' '
)
phrase_ranges
=
sentence
.
find
(
'phrase_ranges'
)
product_title
=
review
[
'product_title'
]
non_ranges
=
[]
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
for
r
in
phrase_ranges
:
print
(
''
)
print
(
bcolors
.
WARNING
+
'{} reviews annotated'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
print
(
''
)
print
(
bcolors
.
OKBLUE
+
'
\'
+
\'
: positive sentiment'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
\'
0
\'
: neutral/no sentiment'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
\'
-
\'
: negative sentiment'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
\'
n
\'
: not an opinion target'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
\'
q
\'
: quit'
+
bcolors
.
ENDC
)
print
(
''
)
product_title
=
review
.
find
(
'product_title'
).
text
print
(
bcolors
.
UNDERLINE
+
product_title
+
bcolors
.
ENDC
)
print
(
''
)
start
=
int
(
r
.
text
.
split
(
','
)[
0
])
end
=
int
(
r
.
text
.
split
(
','
)[
1
])
text
=
review
[
'review_body'
]
tokens
=
tokenizer
.
tokenize
(
text
)
index_row
=
''
text_row
=
''
for
t
in
range
(
len
(
tokens
)):
space
=
len
(
tokens
[
t
])
-
len
(
str
(
t
))
token_text
=
tokens
[
t
]
+
' '
index_text
=
str
(
t
)
+
' '
if
space
>
0
:
index_text
=
' '
*
math
.
floor
(
space
/
2
)
+
index_text
+
' '
*
math
.
ceil
(
space
/
2
)
elif
space
<
0
:
space
=
abs
(
space
)
token_text
=
' '
*
math
.
floor
(
space
/
2
)
+
token_text
+
' '
*
math
.
ceil
(
space
/
2
)
index_row
+=
index_text
text_row
+=
token_text
if
t
+
1
<
len
(
tokens
)
and
len
(
index_row
)
+
len
(
tokens
[
t
+
1
])
+
1
>
row_character_count
:
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
print
(
text_row
)
index_row
=
''
index_row
=
''
text_row
=
''
text_row
=
''
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
for
t
in
range
(
len
(
tokens
)):
print
(
text_row
)
space
=
len
(
tokens
[
t
])
-
len
(
str
(
t
))
print
(
''
)
token_text
=
tokens
[
t
]
+
' '
annotations
=
[]
if
t
in
range
(
start
,
end
+
1
):
while
True
:
token_text
=
bcolors
.
BOLD
+
token_text
+
bcolors
.
ENDC
task
=
input
(
'Enter
\'
a
\'
to add an argument,
\'
c
\'
to save annotation and continue,
\'
s
\'
to skip, or
\'
q
\'
to quit: '
)
text_row
+=
token_text
if
task
==
'a'
:
if
t
+
1
<
len
(
tokens
)
and
len
(
text_row
)
+
len
(
tokens
[
t
+
1
])
+
1
>
row_character_count
:
rng
=
None
print
(
text_row
)
arg
=
''
text_row
=
''
sentiment
=
''
print
(
text_row
)
while
not
rng
:
print
(
''
)
inp
=
input
(
'Enter the index (range) of the argument (in the form x or x,y): '
)
if
inp
.
isdigit
():
while
True
:
rng
=
(
int
(
inp
),
int
(
inp
))
inp
=
input
(
'Enter the sentiment expressed towards the highlighted argument: '
)
elif
(
','
in
inp
and
len
(
inp
.
split
(
','
))
==
2
and
inp
.
split
(
','
)[
0
].
isdigit
()
and
inp
.
split
(
','
)[
1
].
isdigit
()):
rng
=
(
int
(
inp
.
split
(
','
)[
0
]),
int
(
inp
.
split
(
','
)[
1
]))
while
not
arg
:
inp
=
input
(
'Enter argument type (
\'
p
\'
for product,
\'
f
\'
for feature): '
)
if
inp
in
[
'p'
,
'f'
]:
arg
=
inp
while
not
sentiment
:
inp
=
input
(
'Enter the sentiment (
\'
+
\'
,
\'
0
\'
,
\'
-
\'
) expressed towards the argument: '
)
if
inp
in
[
'+'
,
'0'
,
'-'
]:
if
inp
in
[
'+'
,
'0'
,
'-'
]:
sentiment
=
inp
r
.
set
(
'sentiment_annotation'
,
sentiment_mappings
[
inp
])
annotations
.
append
((
rng
,
arg
,
sentiment
))
os
.
system
(
'clear'
)
break
if
task
in
[
'c'
,
's'
,
'q'
]:
if
task
in
[
'c'
,
's'
]:
if
inp
in
[
'n'
]:
n_annotated
+=
1
non_ranges
.
append
(
r
)
# save annotations to tree
os
.
system
(
'clear'
)
review_node
=
SubElement
(
root
,
'review'
)
break
id_node
=
SubElement
(
review_node
,
'review_id'
)
id_node
.
text
=
review
[
'review_id'
]
elif
inp
in
[
'q'
]:
text_node
=
SubElement
(
review_node
,
'text'
)
os
.
system
(
'clear'
)
text_node
.
text
=
text
break
if
task
==
'c'
:
annotations_node
=
SubElement
(
review_node
,
'annotations'
)
if
inp
==
'q'
:
for
annotation
in
annotations
:
break
annotation_node
=
SubElement
(
annotations_node
,
'annotation'
)
range_node
=
SubElement
(
annotation_node
,
'range'
)
for
non_range
in
non_ranges
:
range_node
.
text
=
'{},{}'
.
format
(
annotation
[
0
][
0
],
annotation
[
0
][
1
])
phrase_ranges
.
remove
(
non_range
)
arg_node
=
SubElement
(
annotation_node
,
'argument'
)
if
inp
==
'q'
:
arg_node
.
text
=
'product'
if
annotation
[
1
]
==
'p'
else
'feature'
sent_node
=
SubElement
(
annotation_node
,
'sentiment'
)
sent_node
.
text
=
sentiment_mappings
[
annotation
[
2
]]
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
xmlstr
=
os
.
linesep
.
join
([
s
for
s
in
xmlstr
.
splitlines
()
if
s
.
strip
()])
with
open
(
annotated_reviews_location
,
'w'
)
as
f
:
f
.
write
(
xmlstr
)
os
.
system
(
'clear'
)
break
break
if
task
==
'q'
:
if
inp
==
'q'
:
break
break
else
:
n_annotated
+=
1
review
.
set
(
'annotated'
,
'true'
)
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
xmlstr
=
os
.
linesep
.
join
([
s
for
s
in
xmlstr
.
splitlines
()
if
s
.
strip
()])
with
open
(
selected_reviews_location
,
'w'
)
as
f
:
f
.
write
(
xmlstr
)
# prepare_reviews()
# prepare_reviews()
# annotate_reviews()
annotate_reviews
()
prepare_reviews
()
ADA/reviews_to_be_annotated.xml
0 → 100644
View file @
6c6eb5e2
This diff is collapsed.
Click to expand it.
ADA/reviews_to_be_annotated2.xml
deleted
100644 → 0
View file @
3299e4af
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment