Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
4ac66e6b
Commit
4ac66e6b
authored
Jun 10, 2020
by
Joel Oksanen
Browse files
Improved sentiment annotation
parent
f0285f85
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
ADA/server/agent/SA/product_reviews_to_be_annotated.xml
0 → 100644
View file @
4ac66e6b
This diff is collapsed.
Click to expand it.
ADA/server/agent/sentiment_annotation.py
→
ADA/server/agent/
SA/
sentiment_annotation.py
View file @
4ac66e6b
...
...
@@ -12,17 +12,18 @@ import readchar
from
sty
import
fg
,
bg
,
ef
,
rs
from
wcwidth
import
wcswidth
data_location
=
'
amazon_data/amazon_reviews_us_pc
.tsv'
selected_reviews_location
=
'p
c
_reviews_to_be_annotated.xml'
data_location
=
'
data/reviews/5_products_reviews
.tsv'
selected_reviews_location
=
'p
roduct
_reviews_to_be_annotated.xml'
min_characters
=
0
max_characters
=
200
n
=
500
sentiment_mappings
=
{
'+'
:
'positive'
,
'0'
:
'neutral'
,
'-'
:
'negative'
,
'c'
:
'conflict'
}
ann_bgs
=
{
'positive'
:
bg
.
green
,
'neutral'
:
bg
.
blue
,
'negative'
:
bg
.
red
,
'conflict'
:
bg
.
yellow
}
ann_fgs
=
{
'positive'
:
fg
.
green
,
'neutral'
:
fg
.
blue
,
'negative'
:
fg
.
red
,
'conflict'
:
fg
.
yellow
}
annotated_reviews_location
=
'annotated_camera_reviews.xml'
included_labels
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
,
'DT'
,
'CD'
,
'FW'
,
'PRP$'
]
nouns
=
[
'NN'
,
'NNS'
,
'NP'
,
'NNP'
,
'NNPS'
]
prepared_reviews_location
=
'annotated_
amazon_laptop
_reviews.xml'
prepared_reviews_location
=
'annotated_
5_products
_reviews
_2
.xml'
tokenizer
=
TweetTokenizer
()
sent_tokenizer
=
nltk
.
data
.
load
(
'tokenizers/punkt/english.pickle'
)
...
...
@@ -77,42 +78,16 @@ def is_opinion_target(tree):
def
prepare_reviews
():
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
# drop reviews with empty review body
reviews
=
reviews
[
~
reviews
[
'review_body'
].
isnull
()]
# laptop reviews
reviews
=
reviews
[
reviews
[
'product_title'
].
str
.
contains
(
'laptop'
,
case
=
False
,
na
=
False
)]
# try to filter out reviews for accessories
filter_words
=
[
'accessor'
,
'batter'
,
'charger'
,
'tripod'
,
'strap'
,
'case'
,
'bag'
,
'filter'
,
'backpack'
,
'kit'
,
'printer'
,
'adapter'
,
'album'
,
'surveillance'
,
'security'
]
filter_pat
=
''
for
word
in
filter_words
:
word_filter
=
'['
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
']'
+
word
[
1
:]
filter_pat
+=
word_filter
+
'|'
filter_pat
=
filter_pat
[:
-
1
]
reviews
=
reviews
[
~
reviews
[
'product_title'
].
str
.
contains
(
pat
=
filter_pat
,
regex
=
True
,
case
=
False
,
na
=
False
)]
# shuffle reviews
reviews
=
reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
# pick first n reviews
reviews
=
reviews
.
head
(
n
)
nlp
=
StanfordCoreNLP
(
r
'/Users/joeloksanen/stanford-corenlp-full-2018-10-05'
)
root
=
Element
(
'reviews'
)
for
_
,
review
in
reviews
.
iterrows
():
review_node
=
SubElement
(
root
,
'review'
)
review_node
.
set
(
'annotated'
,
'false'
)
id_node
=
SubElement
(
review_node
,
'review_id'
)
id_node
.
text
=
review
[
'review_id'
]
title_node
=
SubElement
(
review_node
,
'product_title'
)
title_node
.
text
=
review
[
'product_title'
]
id_node
.
text
=
review
[
'reviewerID'
]
text_node
=
SubElement
(
review_node
,
'review_body'
)
# reformat text
text
=
review
[
'review
_body
'
]
text
=
review
[
'review
Text
'
]
text
=
text
.
replace
(
'<br />'
,
'
\n
'
)
text
=
re
.
sub
(
'[.][.]+'
,
'...'
,
text
)
text
=
text
.
replace
(
'"'
,
'"'
)
...
...
@@ -122,21 +97,11 @@ def prepare_reviews():
sentences_node
=
SubElement
(
review_node
,
'sentences'
)
sentences
=
sent_tokenizer
.
tokenize
(
text
)
phrase_indices
=
[]
for
sentence
in
sentences
:
sentence_node
=
SubElement
(
sentences_node
,
'sentence'
)
sentence_text_node
=
SubElement
(
sentence_node
,
'text'
)
sentence_text_node
.
text
=
sentence
parse_tree_str
=
nlp
.
parse
(
sentence
)
parse_tree
=
Tree
.
fromstring
(
parse_tree_str
)
parse_tree_node
=
SubElement
(
sentence_node
,
'parse_tree'
)
parse_tree_node
.
text
=
parse_tree_str
tokenized_text_node
=
SubElement
(
sentence_node
,
'tokenized_text'
)
tokenized_text_node
.
text
=
' '
.
join
(
parse_tree
.
leaves
()).
replace
(
'``'
,
'""'
)
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
' '
)
xmlstr
=
os
.
linesep
.
join
([
s
for
s
in
xmlstr
.
splitlines
()
if
s
.
strip
()])
...
...
@@ -177,8 +142,10 @@ def annotate_reviews():
print
(
bcolors
.
OKBLUE
+
'quit:
\'
q
\'
'
+
bcolors
.
ENDC
)
print
(
''
)
product_title
=
review
.
find
(
'product_title'
).
text
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
sent_str
=
''
for
c
,
sent
in
sentiment_mappings
.
items
():
sent_str
+=
'{}{}: {}{}, '
.
format
(
ann_fgs
[
sent
],
sent
,
c
,
bcolors
.
ENDC
)
print
(
sent_str
[:
-
2
])
print
(
''
)
text_row
=
''
...
...
@@ -310,7 +277,6 @@ def prepare_annotated_reviews():
for
review
in
annotated
:
for
sentence
in
review
.
find
(
'sentences'
):
text
=
sentence
.
find
(
'text'
).
text
tree_str
=
sentence
.
find
(
'parse_tree'
).
text
sentence_node
=
SubElement
(
prepared_root
,
'sentence'
)
text_node
=
SubElement
(
sentence_node
,
'text'
)
text_node
.
text
=
text
...
...
@@ -334,4 +300,4 @@ def prepare_annotated_reviews():
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews
()
prepare_annotated_reviews
()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment