Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Joel Oksanen
individual_project
Commits
8b1ab0c1
Commit
8b1ab0c1
authored
Apr 04, 2020
by
Joel Oksanen
Browse files
Fixed some bugs in prep_data for SemEval data
parent
2c282c01
Changes
2
Expand all
Show whitespace changes
Inline
Side-by-side
ADA/SA/data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml
View file @
8b1ab0c1
This diff is collapsed.
Click to expand it.
ADA/SA/data/SemEval-2016/prep_data.py
View file @
8b1ab0c1
...
...
@@ -7,8 +7,10 @@ glossary = {
'laptop'
:
[
'computer'
,
'device'
,
'machine'
,
'price'
,
'cost'
,
'macbook'
,
'mac'
,
'pc'
,
'speed'
,
'it'
,
'this'
,
'product'
],
'display'
:
[
'monitor'
,
'screen'
,
'touchscreen'
],
'cpu'
:
[
'processor'
],
'motherboard'
:
[],
'hard disc'
:
[
'storage'
],
'memory'
:
[
'ram'
],
'battery'
:
[
'battery life'
],
'power supply'
:
[
'charger'
,
'power supply cord'
,
'power adapter'
],
'keyboard'
:
[
'keys'
,
'numpad'
],
'mouse'
:
[
'mouse pad'
,
'touchpad'
],
...
...
@@ -17,27 +19,38 @@ glossary = {
'ports'
:
[
'usb port'
,
'hdmi port'
,
'vga port'
,
'card reader'
,
'firewire port'
,
'sd card slot'
,
'dvi port'
,
'thunderbolt port'
],
'graphics'
:
[
'graphics card'
,
'video card'
,
'graphics chip'
,
'gpu'
],
'multimedia devices'
:
[
'sound'
,
'audio'
,
'microphone'
,
'camera'
,
'webcam'
,
'speakers'
,
'headphone'
],
'hardware'
:
[],
'os'
:
[
'os x'
,
'windows'
,
'linux'
,
'start menu'
,
'safe mode'
,
'boot manager'
,
'drag and drop feature'
],
'software'
:
[
'office'
,
'iwork'
,
'word processor'
,
'microsoft word'
,
'powerpoint'
,
'browser'
,
'skype'
,
'iphoto'
,
'ilife'
,
'pages'
,
'keynote'
,
'antivirus program'
,
'firewall'
,
'games'
,
'facial recognition'
],
'warranty'
:
[],
'shipping'
:
[
'delivery'
],
'support'
:
[
'service'
],
'support'
:
[
'
service'
,
'customer
service'
],
'company'
:
[
'apple'
,
'hp'
,
'asus'
,
'toshiba'
,
'dell'
,
'compaq'
,
'acer'
,
'lenovo'
]
}
# Node labels that get_np_tree tests child subtrees against (presumably Penn
# Treebank POS tags — confirm against the parser that produced the trees).
# This is the single effective definition: the list was previously assigned
# twice in a row, and the first assignment was dead because it was overwritten
# immediately.
included_labels = ['NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP', '\'\'', '.']

# Noun labels handled separately by included_noun, which additionally requires
# every leaf of the node to be a glossary word.
noun_labels = ['NN', 'NNS']
def glossary_terms():
    """Return every individual word used in the glossary.

    Collects all glossary keys followed by all synonym entries from the
    glossary values, then breaks multi-word phrases apart on single spaces,
    so a phrase such as 'hard disc' contributes 'hard' and 'disc'.
    """
    phrases = list(glossary.keys())
    for synonyms in glossary.values():
        phrases.extend(synonyms)
    # Join-then-split flattens multi-word phrases into single words.
    return ' '.join(phrases).split(' ')
def included_noun(t):
    """Return True if ``t`` is a noun node made up only of glossary words.

    Args:
        t: a tree node exposing ``label()`` and ``leaves()``; the leaves are
           expected to be strings.

    Returns:
        True when ``t.label()`` is in ``noun_labels`` and the lower-cased form
        of every leaf is one of the words returned by ``glossary_terms()``;
        False otherwise.
    """
    # Cheap label check first, so non-noun nodes never build the vocabulary.
    if t.label() not in noun_labels:
        return False
    # Build the vocabulary once per call instead of once per leaf, and use a
    # set so each membership test is O(1) rather than a list scan.
    vocabulary = set(glossary_terms())
    return all(leaf.lower() in vocabulary for leaf in t.leaves())
# get_np_tree(np): rebuild a tree from `np`, visiting its children from last
# to first (reversed) and prepending each accepted child to `children`.
# Returns a 2-tuple (tree, flag). The tree is Tree(np.label(), children), or
# None on the returns guarded by `if children else None`. The flag is False on
# every return inside the branches and True on the final return.
# The function calls itself on Tree children and uses the returned flag
# (`cont`) to decide whether to return immediately.
# NOTE(review): this text comes from a rendered diff. Indentation is lost and
# removed/added lines appear back to back, so the exact nesting of the
# branches below cannot be determined from here. Confirm against the real
# prep_data.py before relying on it.
def
get_np_tree
(
np
):
children
=
[]
for
np_sub
in
reversed
(
np
):
if
type
(
np_sub
)
is
Tree
:
# NOTE(review): this `not in included_labels` test and the
# `in included_labels or included_noun(...)` test further down look like the
# pre-change and post-change versions of the same condition. TODO confirm
# which one is live.
if
np_sub
.
label
()
not
in
included_labels
:
return
(
Tree
(
np
.
label
(),
children
)
if
children
else
None
,
False
)
else
:
if
np_sub
.
label
()
in
included_labels
or
included_noun
(
np_sub
):
# Recurse into the accepted subtree and prepend the rebuilt result.
subtree
,
cont
=
get_np_tree
(
np_sub
)
assert
subtree
!=
None
children
=
[
subtree
]
+
children
# A False flag from the recursive call ends the scan of this node as well.
if
not
cont
:
return
(
Tree
(
np
.
label
(),
children
),
False
)
else
:
return
(
Tree
(
np
.
label
(),
children
)
if
children
else
None
,
False
)
# Presumably the branch for non-Tree children (leaf tokens), which are
# prepended unchanged. Verify the pairing of this `else` against the file.
else
:
children
=
[
np_sub
]
+
children
return
(
Tree
(
np
.
label
(),
children
),
True
)
...
...
@@ -100,7 +113,7 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
if
len
(
np_matches
)
==
0
:
return
None
unique_nps
=
list
(
filter
(
lambda
np
:
not
any
(
tree_contains
(
np
,
other_np
)
for
other_np
in
[
x
for
x
in
np_matches
if
x
!=
np
]),
np_matches
))
unique_nps
=
np_matches
#
list(filter(lambda np: not any(tree_contains(np, other_np) for other_np in [x for x in np_matches if x != np]), np_matches))
modified_tree
=
parse_tree
.
copy
(
deep
=
True
)
for
unique_np
in
unique_nps
:
...
...
@@ -109,6 +122,46 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
assert
parse_tree
!=
modified_tree
return
modified_tree
# parse_tree_str = '''(ROOT
# (S
# (SBAR (RB Ever) (IN since)
# (S
# (NP (PRP I))
# (VP (VBD bought)
# (NP (DT this) (NN laptop)))))
# (, ,)
# (ADVP (RB so) (RB far))
# (NP (PRP I))
# (VP (VBP 've)
# (NP
# (NP (NN experience) (NN nothing))
# (CC but)
# (NP
# (NP (JJ constant) (NN break) (NNS downs))
# (PP (IN of)
# (NP
# (NP (DT the)
# (NP (NN laptop))
# (CC and)
# (NP (JJ bad) (NN customer) (NNS services)))
# (SBAR
# (S
# (NP (PRP I))
# (VP (VBD received)
# (PP (IN over)
# (NP (DT the) (NN phone)))
# (PP (IN with)
# (NP (NN toshiba) (NN customer) (NNS services) (NNS hotlines)))))))))))
# (. .)))'''
# parse_tree = Tree.fromstring(parse_tree_str)
# nps = extract_extended_nouns(parse_tree)
# # for n in nps:
# # print(n)
#
# mod = replace_feature_nps_tree('laptop', parse_tree, nps)
# print(mod)
# Parse the SemEval-2016 laptop training XML and keep its root element, which
# the rest of the script iterates over as the collection of reviews.
# NOTE(review): `parse` is presumably xml.etree.ElementTree.parse — confirm
# against the file's imports.
tree = parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
...
...
@@ -143,7 +196,7 @@ for review in reviews:
for
opinion
in
opinions
:
total_opinions_count
+=
1
modified_tree
=
replace_feature_nps_tree
(
opinion
[
0
],
parse_tree
,
nps
)
if
modified_tree
and
prepared_counts
[
opinion
[
1
]]
<
5
00
:
if
modified_tree
and
prepared_counts
[
opinion
[
1
]]
<
5
26
:
opinion_trees
.
append
(
modified_tree
)
prepared_opinions_count
+=
1
prepared_counts
[
opinion
[
1
]]
+=
1
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment