Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
29ed5986
Commit
29ed5986
authored
May 18, 2020
by
Joel Oksanen
Browse files
Implemented entity annotation
parent
cbd56512
Changes
2
Hide whitespace changes
Inline
Side-by-side
ADA/server/agent/
review
_annotation.py
→
ADA/server/agent/
sentiment
_annotation.py
View file @
29ed5986
File moved
ADA/server/agent/target_extraction/entity_annotation.py
0 → 100644
View file @
29ed5986
import
pandas
as
pd
from
xml.etree.ElementTree
import
ElementTree
,
parse
,
tostring
,
Element
,
SubElement
from
gensim.models.phrases
import
Phrases
,
Phraser
from
nltk
import
pos_tag
from
nltk.tokenize
import
word_tokenize
,
sent_tokenize
import
string
from
nltk.corpus
import
stopwords
import
re
from
collections
import
Counter
import
pickle
import
os
import
readchar
from
sty
import
fg
,
bg
from
anytree
import
Node
,
RenderTree
,
LevelOrderIter
,
PreOrderIter
# Score threshold handed to gensim Phrases: higher means fewer, stronger phrases.
PHRASE_THRESHOLD = 4
# Terminal row width (characters) for rendering annotation output.
ROW_CHARACTER_COUNT = 100

# English stop words used to reject candidate nouns.
stop_words = stopwords.words('english')

# Background colours for highlighting annotated entities: child, parent.
ann_bgs = [bg.blue, bg.red]
class EntityAnnotator:
    """Interactive command-line tool for annotating entities extracted from
    review texts and arranging them into a feature tree."""

    def __init__(self, text_file_path, counter, save_path):
        """Initialise annotation state.

        text_file_path -- TSV file of reviews this annotator was built from
        counter -- Counter of candidate noun (phrase) frequencies
        save_path -- pickle file this annotator persists itself to
        """
        self.text_file_path = text_file_path
        self.counter = counter
        self.save_path = save_path
        # Root of the entity tree; stays None until the first 'r' annotation.
        self.root = None
        # Index into counter.most_common() of the next entity to present.
        self.n_annotated = 0
@
staticmethod
def
new_from_tsv
(
file_path
,
name
):
df
=
pd
.
read_csv
(
file_path
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
texts
=
[
text
.
replace
(
'_'
,
' '
)
for
_
,
par
in
df
[
'reviewText'
].
items
()
if
not
pd
.
isnull
(
par
)
for
text
in
sent_tokenize
(
par
)]
counter
=
EntityAnnotator
.
count_nouns
(
texts
)
ann
=
EntityAnnotator
(
file_path
,
counter
,
name
+
'.pickle'
)
return
ann
@
staticmethod
def
load_saved
(
file_path
):
f
=
open
(
file_path
,
'rb'
)
ann
=
pickle
.
load
(
f
)
f
.
close
()
return
ann
def
save
(
self
):
f
=
open
(
self
.
save_path
,
'wb'
)
pickle
.
dump
(
self
,
f
)
f
.
close
()
@
staticmethod
def
count_nouns
(
texts
):
# obtain phraser
bigram
=
Phrases
(
texts
,
threshold
=
PHRASE_THRESHOLD
)
trigram
=
Phrases
(
bigram
[
texts
],
threshold
=
PHRASE_THRESHOLD
)
phraser
=
Phraser
(
trigram
)
# count nouns
nouns
=
[]
for
text
in
texts
:
pos_tags
=
pos_tag
(
text
)
ngrams
=
phraser
[
text
]
word_idx
=
0
for
token
in
ngrams
:
if
'_'
in
token
:
words
=
token
.
split
(
'_'
)
word_range
=
range
(
word_idx
,
word_idx
+
len
(
words
))
has_noun
=
any
(
EntityAnnotator
.
is_noun
(
pos_tags
[
i
])
for
i
in
word_range
)
all_terms_valid
=
all
(
EntityAnnotator
.
is_valid_term
(
pos_tags
[
i
])
for
i
in
word_range
)
if
has_noun
and
all_terms_valid
:
nouns
.
append
(
token
)
word_idx
+=
len
(
words
)
else
:
is_noun
=
EntityAnnotator
.
is_noun
(
pos_tags
[
word_idx
])
is_valid
=
EntityAnnotator
.
is_valid_term
(
pos_tags
[
word_idx
])
if
len
(
token
)
>
1
and
is_noun
and
is_valid
:
nouns
.
append
(
token
)
word_idx
+=
1
return
Counter
(
nouns
)
@
staticmethod
def
is_noun
(
pos_tagged
):
word
,
tag
=
pos_tagged
return
tag
.
startswith
(
'NN'
)
and
word
.
lower
()
not
in
string
.
punctuation
and
word
not
in
stop_words
# true if term is not a preposition and does not include special characters
@
staticmethod
def
is_valid_term
(
pos_tagged
):
alpha_numeric_pat
=
'^\w+$'
word
,
tag
=
pos_tagged
return
tag
!=
'IN'
and
re
.
match
(
alpha_numeric_pat
,
word
)
def
annotate
(
self
):
while
True
:
entity
=
self
.
select_entity
()
os
.
system
(
'clear'
)
print
(
fg
.
li_blue
+
'{} entities annotated'
.
format
(
self
.
n_annotated
)
+
fg
.
rs
)
print
(
''
)
print
(
fg
.
li_black
+
'root:
\'
r
\'
'
+
fg
.
rs
)
print
(
fg
.
li_black
+
'subfeat: [number of parent node][ENTER]'
+
fg
.
rs
)
print
(
fg
.
li_black
+
'skip:
\'
s
\'
'
+
fg
.
rs
)
print
(
fg
.
li_black
+
'quit:
\'
q
\'
'
+
fg
.
rs
)
print
(
''
)
if
self
.
root
is
not
None
:
print
(
RenderTree
(
self
.
root
))
print
(
''
)
print
(
entity
)
task
=
readchar
.
readkey
()
if
task
==
'r'
:
old_root
=
self
.
root
self
.
root
=
Node
(
entity
)
old_root
.
parent
=
self
.
root
self
.
update_tree_indices
()
self
.
n_annotated
+=
1
if
task
.
isdigit
():
n
=
int
(
task
)
while
True
:
subtask
=
readchar
.
readkey
()
if
subtask
.
isdigit
():
n
=
n
*
10
+
int
(
subtask
)
if
subtask
==
readchar
.
key
.
ENTER
:
Node
(
entity
,
parent
=
self
.
node_with_number
(
n
))
self
.
n_annotated
+=
1
break
if
task
==
's'
:
self
.
n_annotated
+=
1
if
task
==
'q'
:
break
self
.
save
()
def
select_entity
(
self
):
entity
=
self
.
counter
.
most_common
()[
self
.
n_annotated
]
return
entity
.
replace
(
'_'
,
' '
)
def
node_with_number
(
self
,
n
):
return
list
(
LevelOrderIter
(
self
.
root
))[
n
]
def
update_tree_indices
(
self
):
i
=
0
for
node
in
LevelOrderIter
(
self
.
root
):
node
.
n
=
i
i
+=
1
# def get_relation_tuples(self):
# rels = []
# for e1 in LevelOrderIter(self.root):
# if e1.isleaf():
# continue
# for e2 in e1.children:
# rels.append((e1.name, e2.name)) # e1 hasFeature e2
# return rels
def
get_annotated_texts
(
self
,
save_path
):
df
=
pd
.
read_csv
(
self
.
text_file_path
,
sep
=
'
\t
'
,
error_bad_lines
=
False
)
df
[
'relations'
]
=
df
[
'reviewText'
].
apply
(
lambda
t
:
self
.
relations_for_text
(
t
))
df
=
df
[
~
df
[
'relations'
].
isnull
()]
df
.
to_csv
(
save_path
,
sep
=
'
\t
'
,
index
=
False
)
def
relations_for_text
(
self
,
text
):
rels
=
[]
child_entities
=
[]
for
e1
in
PreOrderIter
(
self
.
root
):
if
not
e1
.
isleaf
()
and
e1
.
name
in
text
:
for
e2
in
e1
.
children
:
if
e2
.
name
in
text
:
# e1 is a parent of an entity in the text
if
e1
in
child_entities
:
# e1 cannot be a parent and a child
return
None
rels
.
append
({
'em1Text'
:
e1
,
'em2Text'
:
e2
,
'label'
:
'/has_feature'
})
child_entities
.
append
(
e2
)
return
rels
# Build a new annotator from the camera reviews and start the interactive
# annotation session (progress is pickled to camera_entity_annotator.pickle).
# Guarded so that importing this module no longer launches the session.
if __name__ == '__main__':
    ann = EntityAnnotator.new_from_tsv('data/verified_camera_reviews.tsv',
                                       'camera_entity_annotator')
    ann.annotate()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment