Commit 8b1ab0c1 authored by Joel Oksanen
Browse files

Fixed some bugs in prep_data for SemEval data

parent 2c282c01
......@@ -7,8 +7,10 @@ glossary = {
'laptop': ['computer', 'device', 'machine', 'price', 'cost', 'macbook', 'mac', 'pc', 'speed', 'it', 'this', 'product'],
'display': ['monitor', 'screen', 'touchscreen'],
'cpu': ['processor'],
'motherboard': [],
'hard disc': ['storage'],
'memory': ['ram'],
'battery': ['battery life'],
'power supply': ['charger', 'power supply cord', 'power adapter'],
'keyboard': ['keys', 'numpad'],
'mouse': ['mouse pad', 'touchpad'],
......@@ -17,27 +19,38 @@ glossary = {
'ports': ['usb port', 'hdmi port', 'vga port', 'card reader', 'firewire port', 'sd card slot', 'dvi port', 'thunderbolt port'],
'graphics': ['graphics card', 'video card', 'graphics chip', 'gpu'],
'multimedia devices': ['sound', 'audio', 'microphone', 'camera', 'webcam', 'speakers', 'headphone'],
'hardware': [],
'os': ['os x', 'windows', 'linux', 'start menu', 'safe mode', 'boot manager', 'drag and drop feature'],
'software': ['office', 'iwork', 'word processor', 'microsoft word', 'powerpoint', 'browser', 'skype', 'iphoto', 'ilife', 'pages', 'keynote', 'antivirus program', 'firewall', 'games', 'facial recognition'],
'warranty': [],
'shipping': ['delivery'],
'support': ['service'],
'support': ['service', 'customer service'],
'company': ['apple', 'hp', 'asus', 'toshiba', 'dell', 'compaq', 'acer', 'lenovo']
}
# Constituent labels that are always kept when trimming a noun phrase in
# get_np_tree. In the Penn Treebank tag set these are proper nouns (NNP, NNPS),
# determiners (DT), cardinal numbers (CD), foreign words (FW), personal
# pronouns (PRP), closing quotation marks ('') and sentence-final
# punctuation (.).
included_labels = ['NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP', "''", '.']
# Common-noun labels. These are not kept unconditionally: included_noun only
# accepts such a node when every one of its leaves is a glossary word.
noun_labels = ['NN', 'NNS']
def glossary_terms():
    """Return every individual word that appears in the glossary.

    The glossary keys come first, followed by all synonym entries in
    dictionary order. Multi-word entries (e.g. 'hard disc') are broken into
    their single words by joining everything with spaces and splitting again.
    """
    phrases = list(glossary)
    for synonyms in glossary.values():
        phrases.extend(synonyms)
    return ' '.join(phrases).split(' ')
def included_noun(t):
    """Tell whether tree node ``t`` is a common noun made only of glossary words.

    Returns True when ``t.label()`` is one of ``noun_labels`` and the
    lower-cased form of every leaf of ``t`` is among ``glossary_terms()``;
    otherwise returns False.
    """
    if t.label() not in noun_labels:
        return False
    # Build the word set once instead of regenerating the whole glossary
    # word list for every leaf.
    known_words = set(glossary_terms())
    return all(leaf.lower() in known_words for leaf in t.leaves())
def get_np_tree(np):
    """Build the accepted right-hand portion of the noun phrase ``np``.

    The children of ``np`` are scanned from right to left. Plain leaves are
    always kept. A ``Tree`` child is kept (after being trimmed recursively)
    when its label is in ``included_labels`` or ``included_noun`` accepts it;
    the first ``Tree`` child that fails both checks ends the scan.

    Returns a ``(tree, cont)`` pair:
      * ``tree`` is a new ``Tree`` with ``np``'s label holding the kept
        children in their original order, or ``None`` when the scan stopped
        before anything was kept.
      * ``cont`` is ``True`` only when every child of ``np`` was kept in full,
        i.e. the caller may keep scanning further to the left.
    """
    children = []
    for np_sub in reversed(np):
        # Exact type check kept from the original: anything that is not
        # precisely a Tree is treated as a leaf token.
        if type(np_sub) is not Tree:
            children.insert(0, np_sub)
            continue

        accepted = np_sub.label() in included_labels or included_noun(np_sub)
        if not accepted:
            # Hit a constituent that must not be part of the phrase: return
            # whatever was collected to its right and tell callers to stop.
            return (Tree(np.label(), children) if children else None, False)

        subtree, cont = get_np_tree(np_sub)
        # Invariant: an accepted child must yield a subtree. Identity
        # comparison avoids going through Tree's equality operator.
        assert subtree is not None
        children.insert(0, subtree)
        if not cont:
            # The child was only partially kept, so nothing further left
            # can be attached to it.
            return (Tree(np.label(), children), False)

    return (Tree(np.label(), children), True)
......@@ -100,7 +113,7 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
if len(np_matches) == 0:
return None
unique_nps = list(filter(lambda np: not any(tree_contains(np, other_np) for other_np in [x for x in np_matches if x != np]), np_matches))
unique_nps = np_matches # list(filter(lambda np: not any(tree_contains(np, other_np) for other_np in [x for x in np_matches if x != np]), np_matches))
modified_tree = parse_tree.copy(deep=True)
for unique_np in unique_nps:
......@@ -109,6 +122,46 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
assert parse_tree != modified_tree
return modified_tree
# parse_tree_str = '''(ROOT
# (S
# (SBAR (RB Ever) (IN since)
# (S
# (NP (PRP I))
# (VP (VBD bought)
# (NP (DT this) (NN laptop)))))
# (, ,)
# (ADVP (RB so) (RB far))
# (NP (PRP I))
# (VP (VBP 've)
# (NP
# (NP (NN experience) (NN nothing))
# (CC but)
# (NP
# (NP (JJ constant) (NN break) (NNS downs))
# (PP (IN of)
# (NP
# (NP (DT the)
# (NP (NN laptop))
# (CC and)
# (NP (JJ bad) (NN customer) (NNS services)))
# (SBAR
# (S
# (NP (PRP I))
# (VP (VBD received)
# (PP (IN over)
# (NP (DT the) (NN phone)))
# (PP (IN with)
# (NP (NN toshiba) (NN customer) (NNS services) (NNS hotlines)))))))))))
# (. .)))'''
# parse_tree = Tree.fromstring(parse_tree_str)
# nps = extract_extended_nouns(parse_tree)
# # for n in nps:
# # print(n)
#
# mod = replace_feature_nps_tree('laptop', parse_tree, nps)
# print(mod)
# Load the ABSA16 laptops training XML (the variant annotated with parse
# trees) and take its root element, which the loop below iterates over.
# NOTE(review): `parse` is presumably xml.etree.ElementTree.parse — confirm
# against the file's imports.
tree = parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
......@@ -143,7 +196,7 @@ for review in reviews:
for opinion in opinions:
total_opinions_count += 1
modified_tree = replace_feature_nps_tree(opinion[0], parse_tree, nps)
if modified_tree and prepared_counts[opinion[1]] < 500:
if modified_tree and prepared_counts[opinion[1]] < 526:
opinion_trees.append(modified_tree)
prepared_opinions_count += 1
prepared_counts[opinion[1]] += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment