Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
individual_project
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Joel Oksanen
individual_project
Commits
16cb2dcc
Commit
16cb2dcc
authored
4 years ago
by
Joel Oksanen
Browse files
Options
Downloads
Patches
Plain Diff
Changed review annotation to arrow key based
parent
52741466
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
ADA/SA/bert_analyzer.py
+5
-5
5 additions, 5 deletions
ADA/SA/bert_analyzer.py
ADA/review_annotation.py
+100
-80
100 additions, 80 deletions
ADA/review_annotation.py
with
105 additions
and
85 deletions
ADA/SA/bert_analyzer.py
+
5
−
5
View file @
16cb2dcc
...
@@ -28,8 +28,8 @@ class BertAnalyzer:
...
@@ -28,8 +28,8 @@ class BertAnalyzer:
self
.
net
.
load_state_dict
(
torch
.
load
(
trained_model_path
))
self
.
net
.
load_state_dict
(
torch
.
load
(
trained_model_path
))
self
.
net
.
eval
()
self
.
net
.
eval
()
def
train
(
self
):
def
train
(
self
,
dataset
):
train_data
=
BertDataset
(
semeval_2014_train_path
)
train_data
=
BertDataset
(
dataset
)
train_loader
=
DataLoader
(
train_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
num_workers
=
4
,
train_loader
=
DataLoader
(
train_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
num_workers
=
4
,
collate_fn
=
generate_batch
)
collate_fn
=
generate_batch
)
...
@@ -65,8 +65,8 @@ class BertAnalyzer:
...
@@ -65,8 +65,8 @@ class BertAnalyzer:
torch
.
save
(
net
.
state_dict
(),
trained_model_path
)
torch
.
save
(
net
.
state_dict
(),
trained_model_path
)
def
evaluate
(
self
):
def
evaluate
(
self
,
dataset
):
test_data
=
BertDataset
(
semeval_2014_test_path
)
test_data
=
BertDataset
(
dataset
)
test_loader
=
DataLoader
(
test_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
False
,
num_workers
=
4
,
test_loader
=
DataLoader
(
test_data
,
batch_size
=
BATCH_SIZE
,
shuffle
=
False
,
num_workers
=
4
,
collate_fn
=
generate_batch
)
collate_fn
=
generate_batch
)
...
@@ -93,4 +93,4 @@ class BertAnalyzer:
...
@@ -93,4 +93,4 @@ class BertAnalyzer:
sentiment_analyzer
=
BertAnalyzer
()
sentiment_analyzer
=
BertAnalyzer
()
sentiment_analyzer
.
load_saved
()
sentiment_analyzer
.
load_saved
()
sentiment_analyzer
.
evaluate
()
sentiment_analyzer
.
evaluate
(
semeval_2014_test_path
)
\ No newline at end of file
\ No newline at end of file
This diff is collapsed.
Click to expand it.
ADA/review_annotation.py
+
100
−
80
View file @
16cb2dcc
...
@@ -8,6 +8,9 @@ import nltk.data
...
@@ -8,6 +8,9 @@ import nltk.data
from
stanfordcorenlp
import
StanfordCoreNLP
from
stanfordcorenlp
import
StanfordCoreNLP
from
nltk.tree
import
ParentedTree
as
Tree
from
nltk.tree
import
ParentedTree
as
Tree
import
re
import
re
import
readchar
from
sty
import
fg
,
bg
,
ef
,
rs
from
wcwidth
import
wcswidth
data_location
=
'
amazon_reviews_us_Camera_v1_00.tsv
'
data_location
=
'
amazon_reviews_us_Camera_v1_00.tsv
'
selected_reviews_location
=
'
reviews_to_be_annotated.xml
'
selected_reviews_location
=
'
reviews_to_be_annotated.xml
'
...
@@ -15,6 +18,7 @@ min_characters = 0
...
@@ -15,6 +18,7 @@ min_characters = 0
max_characters
=
200
max_characters
=
200
n
=
500
n
=
500
sentiment_mappings
=
{
'
+
'
:
'
positive
'
,
'
0
'
:
'
neutral
'
,
'
-
'
:
'
negative
'
,
'
c
'
:
'
conflict
'
}
sentiment_mappings
=
{
'
+
'
:
'
positive
'
,
'
0
'
:
'
neutral
'
,
'
-
'
:
'
negative
'
,
'
c
'
:
'
conflict
'
}
ann_bgs
=
{
'
positive
'
:
bg
.
green
,
'
neutral
'
:
bg
.
li_black
,
'
negative
'
:
bg
.
red
,
'
conflict
'
:
bg
.
yellow
}
annotated_reviews_location
=
'
annotated_camera_reviews.xml
'
annotated_reviews_location
=
'
annotated_camera_reviews.xml
'
included_labels
=
[
'
NN
'
,
'
NNS
'
,
'
NP
'
,
'
NNP
'
,
'
NNPS
'
,
'
DT
'
,
'
CD
'
,
'
FW
'
,
'
PRP$
'
]
included_labels
=
[
'
NN
'
,
'
NNS
'
,
'
NP
'
,
'
NNP
'
,
'
NNPS
'
,
'
DT
'
,
'
CD
'
,
'
FW
'
,
'
PRP$
'
]
nouns
=
[
'
NN
'
,
'
NNS
'
,
'
NP
'
,
'
NNP
'
,
'
NNPS
'
]
nouns
=
[
'
NN
'
,
'
NNS
'
,
'
NP
'
,
'
NNP
'
,
'
NNPS
'
]
...
@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
...
@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
tokenizer
=
TweetTokenizer
()
tokenizer
=
TweetTokenizer
()
sent_tokenizer
=
nltk
.
data
.
load
(
'
tokenizers/punkt/english.pickle
'
)
sent_tokenizer
=
nltk
.
data
.
load
(
'
tokenizers/punkt/english.pickle
'
)
class
bcolors
:
class
bcolors
:
HEADER
=
'
\033
[95m
'
HEADER
=
'
\033
[95m
'
OKBLUE
=
'
\033
[94m
'
OKBLUE
=
'
\033
[94m
'
...
@@ -33,6 +38,7 @@ class bcolors:
...
@@ -33,6 +38,7 @@ class bcolors:
BOLD
=
'
\033
[1m
'
BOLD
=
'
\033
[1m
'
UNDERLINE
=
'
\033
[4m
'
UNDERLINE
=
'
\033
[4m
'
def
get_leaf_indices
(
tree
,
phrase_tree
):
def
get_leaf_indices
(
tree
,
phrase_tree
):
phrase_tree_pos
=
phrase_tree
.
treeposition
()
phrase_tree_pos
=
phrase_tree
.
treeposition
()
start
=
0
start
=
0
...
@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree):
...
@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree):
end
+=
1
end
+=
1
return
(
start
,
end
)
return
(
start
,
end
)
# true if r1 contains r2
# true if r1 contains r2
def
range_contains
(
r1
,
r2
):
def
range_contains
(
r1
,
r2
):
return
r1
[
0
]
<=
r2
[
0
]
and
r1
[
1
]
>=
r2
[
1
]
and
Tree
.
fromstring
(
r2
[
2
])
in
Tree
.
fromstring
(
r1
[
2
]).
subtrees
()
return
r1
[
0
]
<=
r2
[
0
]
and
r1
[
1
]
>=
r2
[
1
]
and
Tree
.
fromstring
(
r2
[
2
])
in
Tree
.
fromstring
(
r1
[
2
]).
subtrees
()
def
in_range
(
r
,
n
):
def
in_range
(
r
,
n
):
return
r
[
0
]
<=
n
and
r
[
1
]
>=
n
return
r
[
0
]
<=
n
and
r
[
1
]
>=
n
# true if rs cover r
# true if rs cover r
def
range_cover
(
r
,
rs
):
def
range_cover
(
r
,
rs
):
for
n
in
range
(
r
[
0
],
r
[
1
]
+
1
):
for
n
in
range
(
r
[
0
],
r
[
1
]
+
1
):
if
not
any
(
in_range
(
other_r
,
n
)
for
other_r
in
rs
):
if
not
any
(
in_range
(
other_r
,
n
)
for
other_r
in
rs
):
return
False
return
False
return
True
return
True
def
is_opinion_target
(
tree
):
def
is_opinion_target
(
tree
):
return
(
tree
.
label
()
in
included_labels
and
return
(
tree
.
label
()
in
included_labels
and
all
(
sub
.
label
()
in
included_labels
or
all
(
sub
.
label
()
in
included_labels
or
(
sub
.
label
()
==
'
PRP
'
and
sub
[
0
].
lower
()
==
'
it
'
)
(
sub
.
label
()
==
'
PRP
'
and
sub
[
0
].
lower
()
==
'
it
'
)
for
sub
in
tree
.
subtrees
()))
for
sub
in
tree
.
subtrees
()))
def
prepare_reviews
():
def
prepare_reviews
():
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
reviews
=
pd
.
read_csv
(
data_location
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
...
@@ -71,13 +82,13 @@ def prepare_reviews():
...
@@ -71,13 +82,13 @@ def prepare_reviews():
# try to filter out reviews for camera accessories
# try to filter out reviews for camera accessories
filter_words
=
[
'
accessor
'
,
'
battery
'
,
'
charger
'
,
'
tripod
'
,
'
strap
'
,
'
case
'
,
'
bag
'
,
filter_words
=
[
'
accessor
'
,
'
battery
'
,
'
charger
'
,
'
tripod
'
,
'
strap
'
,
'
case
'
,
'
bag
'
,
'
backpack
'
,
'
kit
'
,
'
printer
'
,
'
adapter
'
,
'
album
'
,
'
surveillance
'
,
'
security
'
]
'
backpack
'
,
'
kit
'
,
'
printer
'
,
'
adapter
'
,
'
album
'
,
'
surveillance
'
,
'
security
'
]
filter_pat
=
''
filter_pat
=
''
for
word
in
filter_words
:
for
word
in
filter_words
:
word_filter
=
'
[
'
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
'
]
'
+
word
[
1
:]
word_filter
=
'
[
'
+
word
[
0
].
upper
()
+
word
[
0
].
lower
()
+
'
]
'
+
word
[
1
:]
filter_pat
+=
word_filter
+
'
|
'
filter_pat
+=
word_filter
+
'
|
'
filter_pat
=
filter_pat
[:
-
1
]
filter_pat
=
filter_pat
[:
-
1
]
reviews
=
reviews
[
~
reviews
[
'
product_title
'
].
str
.
contains
(
pat
=
filter_pat
,
regex
=
True
)]
reviews
=
reviews
[
~
reviews
[
'
product_title
'
].
str
.
contains
(
pat
=
filter_pat
,
regex
=
True
)]
# shuffle reviews
# shuffle reviews
reviews
=
reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
reviews
=
reviews
.
sample
(
frac
=
1
).
reset_index
(
drop
=
True
)
...
@@ -121,7 +132,7 @@ def prepare_reviews():
...
@@ -121,7 +132,7 @@ def prepare_reviews():
parse_tree_node
.
text
=
parse_tree_str
parse_tree_node
.
text
=
parse_tree_str
tokenized_text_node
=
SubElement
(
sentence_node
,
'
tokenized_text
'
)
tokenized_text_node
=
SubElement
(
sentence_node
,
'
tokenized_text
'
)
tokenized_text_node
.
text
=
'
'
.
join
(
parse_tree
.
leaves
()).
replace
(
'
``
'
,
'""'
)
tokenized_text_node
.
text
=
'
'
.
join
(
parse_tree
.
leaves
()).
replace
(
'
``
'
,
'""'
)
# save tree to file
# save tree to file
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
'
'
)
xmlstr
=
minidom
.
parseString
(
tostring
(
root
)).
toprettyxml
(
indent
=
'
'
)
...
@@ -131,6 +142,7 @@ def prepare_reviews():
...
@@ -131,6 +142,7 @@ def prepare_reviews():
print
(
'
Obtained and parsed
'
,
len
(
reviews
),
'
reviews
'
)
print
(
'
Obtained and parsed
'
,
len
(
reviews
),
'
reviews
'
)
def
annotate_reviews
():
def
annotate_reviews
():
row_character_count
=
100
row_character_count
=
100
reviews
=
parse
(
selected_reviews_location
)
reviews
=
parse
(
selected_reviews_location
)
...
@@ -143,69 +155,69 @@ def annotate_reviews():
...
@@ -143,69 +155,69 @@ def annotate_reviews():
for
review
in
not_annotated
:
for
review
in
not_annotated
:
for
sentence
in
review
.
find
(
'
sentences
'
):
for
sentence
in
review
.
find
(
'
sentences
'
):
tokens
=
sentence
.
find
(
'
tokenized_text
'
).
text
.
split
(
'
'
)
text
=
sentence
.
find
(
'
text
'
).
text
cursor_pos
=
0
os
.
system
(
'
clear
'
)
start
=
None
end
=
None
print
(
bcolors
.
OKBLUE
+
'
{} reviews annotated
'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
print
(
''
)
print
(
bcolors
.
OKBLUE
+
'
annotation: [
\'
i
\'
|
\'
n,m
\'
] [
\'
+
\'
|
\'
0
\'
|
\'
-
\'
|
\'
c
\'
]
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
next:
\'
n
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
skip:
\'
s
\'
'
+
bcolors
.
ENDC
)
print
(
bcolors
.
OKBLUE
+
'
quit:
\'
q
\'
'
+
bcolors
.
ENDC
)
print
(
''
)
product_title
=
review
.
find
(
'
product_title
'
).
text
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
print
(
''
)
index_row
=
''
text_row
=
''
for
t
in
range
(
len
(
tokens
)):
space
=
len
(
tokens
[
t
])
-
len
(
str
(
t
))
token_text
=
tokens
[
t
]
+
'
'
index_text
=
str
(
t
)
+
'
'
if
space
>
0
:
index_text
=
'
'
*
math
.
floor
(
space
/
2
)
+
index_text
+
'
'
*
math
.
ceil
(
space
/
2
)
elif
space
<
0
:
space
=
abs
(
space
)
token_text
=
'
'
*
math
.
floor
(
space
/
2
)
+
token_text
+
'
'
*
math
.
ceil
(
space
/
2
)
index_row
+=
index_text
text_row
+=
token_text
if
t
+
1
<
len
(
tokens
)
and
len
(
index_row
)
+
len
(
tokens
[
t
+
1
])
+
1
>
row_character_count
:
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
print
(
text_row
)
index_row
=
''
text_row
=
''
print
(
bcolors
.
WARNING
+
index_row
+
bcolors
.
ENDC
)
print
(
text_row
)
print
(
''
)
annotations
=
[]
annotations
=
[]
while
True
:
while
True
:
task
=
input
(
'
:
'
)
os
.
system
(
'
clear
'
)
if
len
(
task
.
split
(
'
'
))
==
2
:
print
(
bcolors
.
OKBLUE
+
'
{} reviews annotated
'
.
format
(
n_annotated
)
+
bcolors
.
ENDC
)
rng
=
None
print
(
''
)
sentiment
=
''
print
(
bcolors
.
OKBLUE
+
'
next:
\'
n
\'
'
+
bcolors
.
ENDC
)
fst
=
task
.
split
(
'
'
)[
0
]
print
(
bcolors
.
OKBLUE
+
'
skip:
\'
s
\'
'
+
bcolors
.
ENDC
)
if
fst
.
isdigit
():
print
(
bcolors
.
OKBLUE
+
'
quit:
\'
q
\'
'
+
bcolors
.
ENDC
)
rng
=
(
int
(
fst
),
int
(
fst
))
print
(
''
)
elif
(
'
,
'
in
fst
and
len
(
fst
.
split
(
'
,
'
))
==
2
and
fst
.
split
(
'
,
'
)[
0
].
isdigit
()
and
fst
.
split
(
'
,
'
)[
1
].
isdigit
()):
product_title
=
review
.
find
(
'
product_title
'
).
text
rng
=
(
int
(
fst
.
split
(
'
,
'
)[
0
]),
int
(
fst
.
split
(
'
,
'
)[
1
]))
print
(
bcolors
.
OKGREEN
+
product_title
+
bcolors
.
ENDC
)
print
(
''
)
snd
=
task
.
split
(
'
'
)[
1
]
if
snd
in
sentiment_mappings
.
keys
():
text_row
=
''
sentiment
=
snd
for
t
in
range
(
len
(
text
)):
char
=
text
[
t
]
if
rng
and
sentiment
:
if
t
==
cursor_pos
:
annotations
.
append
((
rng
,
sentiment
))
char
=
bg
.
blue
+
char
+
bg
.
rs
for
ann
in
annotations
:
if
t
in
range
(
ann
[
0
][
0
],
ann
[
0
][
1
]):
char
=
ann_bgs
[
ann
[
1
]]
+
char
+
bg
.
rs
text_row
+=
char
if
(
t
+
1
)
%
row_character_count
==
0
:
print
(
text_row
)
text_row
=
''
print
(
text_row
)
print
(
''
)
task
=
readchar
.
readkey
()
if
task
==
readchar
.
key
.
RIGHT
:
cursor_pos
=
min
(
cursor_pos
+
1
,
len
(
text
)
-
1
)
if
task
==
readchar
.
key
.
LEFT
:
cursor_pos
=
max
(
cursor_pos
-
1
,
0
)
if
task
==
readchar
.
key
.
DOWN
:
cursor_pos
=
min
(
cursor_pos
+
row_character_count
,
len
(
text
)
-
1
)
if
task
==
readchar
.
key
.
UP
:
cursor_pos
=
max
(
cursor_pos
-
row_character_count
,
0
)
if
task
==
readchar
.
key
.
SPACE
:
if
start
==
None
:
start
=
cursor_pos
elif
end
==
None
and
cursor_pos
>=
start
:
end
=
cursor_pos
+
1
rng
=
(
start
,
end
)
while
True
:
inp
=
input
(
'
Sentiment for {},{}:
'
.
format
(
start
,
end
-
1
))
if
inp
in
sentiment_mappings
.
keys
():
annotations
.
append
((
rng
,
sentiment_mappings
[
inp
]))
start
=
None
end
=
None
cursor_pos
=
min
(
cursor_pos
+
1
,
len
(
text
)
-
1
)
break
if
task
in
[
'
n
'
,
'
s
'
,
'
q
'
]:
if
task
in
[
'
n
'
,
'
s
'
,
'
q
'
]:
if
task
in
[
'
n
'
]:
if
task
in
[
'
n
'
]:
...
@@ -216,7 +228,7 @@ def annotate_reviews():
...
@@ -216,7 +228,7 @@ def annotate_reviews():
range_node
=
SubElement
(
annotation_node
,
'
range
'
)
range_node
=
SubElement
(
annotation_node
,
'
range
'
)
range_node
.
text
=
'
{},{}
'
.
format
(
annotation
[
0
][
0
],
annotation
[
0
][
1
])
range_node
.
text
=
'
{},{}
'
.
format
(
annotation
[
0
][
0
],
annotation
[
0
][
1
])
sent_node
=
SubElement
(
annotation_node
,
'
sentiment
'
)
sent_node
=
SubElement
(
annotation_node
,
'
sentiment
'
)
sent_node
.
text
=
sentiment_mappings
[
annotation
[
1
]
]
sent_node
.
text
=
annotation
[
1
]
break
break
if
task
==
'
q
'
:
if
task
==
'
q
'
:
break
break
...
@@ -232,6 +244,7 @@ def annotate_reviews():
...
@@ -232,6 +244,7 @@ def annotate_reviews():
with
open
(
selected_reviews_location
,
'
w
'
)
as
f
:
with
open
(
selected_reviews_location
,
'
w
'
)
as
f
:
f
.
write
(
xmlstr
)
f
.
write
(
xmlstr
)
def
longest_common_subsequence
(
x
,
y
):
def
longest_common_subsequence
(
x
,
y
):
seq
=
[]
seq
=
[]
for
i
in
range
(
min
(
len
(
x
),
len
(
y
))):
for
i
in
range
(
min
(
len
(
x
),
len
(
y
))):
...
@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y):
...
@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y):
return
tuple
(
seq
)
return
tuple
(
seq
)
def
labelled_tree_str
(
tree_str
,
start
,
end
):
def
labelled_tree_str
(
tree_str
,
start
,
end
):
tree
=
Tree
.
fromstring
(
tree_str
)
tree
=
Tree
.
fromstring
(
tree_str
)
start_pos
=
tree
.
leaf_treeposition
(
start
)
start_pos
=
tree
.
leaf_treeposition
(
start
)
...
@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end):
...
@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end):
# find highest parent node common to start and end
# find highest parent node common to start and end
if
start
==
end
:
if
start
==
end
:
parent_pos
=
start_pos
[:
len
(
start_pos
)
-
1
]
parent_pos
=
start_pos
[:
len
(
start_pos
)
-
1
]
else
:
else
:
parent_pos
=
longest_common_subsequence
(
start_pos
,
end_pos
)
parent_pos
=
longest_common_subsequence
(
start_pos
,
end_pos
)
parent_node
=
tree
[
parent_pos
]
parent_node
=
tree
[
parent_pos
]
...
@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end):
...
@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end):
parent_pos
=
parent_pos
[:
len
(
parent_pos
)
-
1
]
parent_pos
=
parent_pos
[:
len
(
parent_pos
)
-
1
]
# remove branches between start and end inclusive
# remove branches between start and end inclusive
child_index_rng
=
range
(
start_pos
[
len
(
parent_pos
)],
end_pos
[
len
(
parent_pos
)]
+
1
)
child_index_rng
=
range
(
start_pos
[
len
(
parent_pos
)],
end_pos
[
len
(
parent_pos
)]
+
1
)
child_positions
=
[
list
(
parent_pos
)
+
[
i
]
for
i
in
child_index_rng
]
child_positions
=
[
list
(
parent_pos
)
+
[
i
]
for
i
in
child_index_rng
]
children_to_remove
=
[
tree
[
tuple
(
child_pos
)]
for
child_pos
in
child_positions
]
children_to_remove
=
[
tree
[
tuple
(
child_pos
)]
for
child_pos
in
child_positions
]
for
child
in
children_to_remove
:
for
child
in
children_to_remove
:
...
@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end):
...
@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end):
return
str
(
tree
)
return
str
(
tree
)
def
prepare_annotated_reviews
():
def
prepare_annotated_reviews
():
reviews
=
parse
(
selected_reviews_location
)
reviews
=
parse
(
selected_reviews_location
)
root
=
reviews
.
getroot
()
root
=
reviews
.
getroot
()
annotated
=
[
review
for
review
in
root
if
review
.
attrib
[
'
annotated
'
]
==
'
true
'
]
annotated
=
[
review
for
review
in
root
if
review
.
attrib
[
'
annotated
'
]
==
'
true
'
]
prepared_root
=
Element
(
'
data
'
)
prepared_root
=
Element
(
'
sentences
'
)
for
review
in
annotated
:
for
review
in
annotated
:
for
sentence
in
review
.
find
(
'
sentences
'
):
for
sentence
in
review
.
find
(
'
sentences
'
):
text
=
sentence
.
find
(
'
text
'
).
text
text
=
sentence
.
find
(
'
text
'
).
text
tree_str
=
sentence
.
find
(
'
parse_tree
'
).
text
tree_str
=
sentence
.
find
(
'
parse_tree
'
).
text
annotations
=
sentence
.
find
(
'
annotations
'
)
if
sentence
.
find
(
'
annotations
'
)
else
[]
sentence_node
=
SubElement
(
prepared_root
,
'
sentence
'
)
for
annotation
in
annotations
:
text_node
=
SubElement
(
sentence_node
,
'
text
'
)
instance_node
=
SubElement
(
prepared_root
,
'
instance
'
)
text_node
.
text
=
text
text_node
=
SubElement
(
instance_node
,
'
text
'
)
text_node
.
text
=
text
if
sentence
.
find
(
'
annotations
'
):
op_node
=
SubElement
(
instance_node
,
'
opinion
'
)
aspect_terms_node
=
SubElement
(
sentence_node
,
'
aspectTerms
'
)
op_node
.
text
=
annotation
.
find
(
'
sentiment
'
).
text
tree_node
=
SubElement
(
instance_node
,
'
tree
'
)
for
annotation
in
sentence
.
find
(
'
annotations
'
):
start
,
end
=
annotation
.
find
(
'
range
'
).
text
.
split
(
'
,
'
)
start
,
end
=
annotation
.
find
(
'
range
'
).
text
.
split
(
'
,
'
)
tree_node
.
text
=
labelled_tree_str
(
tree_str
,
int
(
start
),
int
(
end
))
aspect_term_node
=
SubElement
(
aspect_terms_node
,
'
aspectTerm
'
)
aspect_term_node
.
set
(
'
term
'
,
text
[
start
:
end
])
aspect_term_node
.
set
(
'
polarity
'
,
annotation
.
find
(
'
sentiment
'
).
text
)
aspect_term_node
.
set
(
'
from
'
,
start
)
aspect_term_node
.
set
(
'
to
'
,
end
)
train_count
=
1000
train_count
=
1000
train_root
=
Element
(
'
data
'
)
train_root
=
Element
(
'
data
'
)
...
@@ -316,6 +335,7 @@ def prepare_annotated_reviews():
...
@@ -316,6 +335,7 @@ def prepare_annotated_reviews():
with
open
(
'
amazon_camera_test.xml
'
,
'
w
'
)
as
f
:
with
open
(
'
amazon_camera_test.xml
'
,
'
w
'
)
as
f
:
f
.
write
(
xmlstr
)
f
.
write
(
xmlstr
)
# prepare_reviews()
# prepare_reviews()
#
annotate_reviews()
annotate_reviews
()
prepare_annotated_reviews
()
#
prepare_annotated_reviews()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment