From 54d3f347875ac27336212d40ae7b500417caf606 Mon Sep 17 00:00:00 2001
From: Joel Oksanen <joel.oksanen17@imperial.ac.uk>
Date: Wed, 25 Mar 2020 20:22:41 +0200
Subject: [PATCH] Noun phrase parsing now returns tree instead of string. TODO: Use the tree in opinion target replacement

---
 .../SemEval-2016/prep_data.py | 73 ++++++++++++++++---
 1 file changed, 62 insertions(+), 11 deletions(-)

diff --git a/ADA/aspect-based SA data/SemEval-2016/prep_data.py b/ADA/aspect-based SA data/SemEval-2016/prep_data.py
index a7e93a4..0c8133c 100644
--- a/ADA/aspect-based SA data/SemEval-2016/prep_data.py
+++ b/ADA/aspect-based SA data/SemEval-2016/prep_data.py
@@ -4,7 +4,7 @@ import string
 from nltk.tree import Tree
 
 
-included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD']
+included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD', 'FW']
 
 # Marks all subtrees with descriptive label with 'DESC'
 def get_np_words(np):
@@ -19,6 +19,22 @@ def get_np_words(np):
             return [np_sub]
     return w
 
+def get_np_tree(np):
+    children = []
+    for np_sub in reversed(np):
+        if type(np_sub) is Tree:
+            if np_sub.label() not in included_labels:
+                return (Tree(np.label(), children) if children else None, False)
+            else:
+                subtree, cont = get_np_tree(np_sub)
+                assert subtree != None
+                children = [subtree] + children
+                if not cont:
+                    return (Tree(np.label(), children), False)
+        else:
+            children = [np_sub] + children
+    return (Tree(np.label(), children), True)
+
 def filtered_np(np):
     w = get_np_words(np)
     i = (len(w) - w[::-1].index('DESC')) if 'DESC' in w else 0
@@ -31,8 +47,10 @@ def extract_extended_nouns(tree_str):
     for tree in trees:
         for subtree in tree.subtrees():
             if subtree.label() == 'NP':
-                # np = ' '.join(np.leaves())
-                phrases.append(filtered_np(subtree))
+                np, _ = get_np_tree(subtree)
+                if np:
+                    np = ' '.join(np.leaves())
+                    phrases.append(np)
     return phrases
 
 tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
@@ -92,16 +110,49 @@ def replace_feature_nps(feature, text, nps):
     if len(detected_nps) == 0:
         return None
 
-    print(nps)
-    print(detected_nps)
     unique_nps = list(filter(lambda np: not any(other_np in np for other_np in detected_nps.difference({np})), detected_nps))
-    print(unique_nps)
     for unique_np in unique_nps:
+        if text == text.replace(unique_np, '$T$'):
+            print('***')
+            print(text)
+            print(nps)
+            print(detected_nps)
+            print(unique_nps)
         text = text.replace(unique_np, '$T$')
 
     return text
 
 
+# tree_str = '''(ROOT
+#   (S
+#     (S
+#       (NP (PRP I))
+#       (VP (MD would)
+#         (VP (VB recommend)
+#           (NP (PRP it)))))
+#     (, ,)
+#     (PP (IN for)
+#       (NP
+#         (NP (NN anybody))
+#         (VP (VBG needing)
+#           (NP (DT a) (JJ reliable) (JJ simple) (NN laptop)))))
+#     (. .)))'''
+# trees = Tree.fromstring(tree_str)
+# for tree in trees:
+#     for subtree in tree.subtrees():
+#         if subtree.label() == 'NP':
+#             np, t = get_np_tree(subtree)
+#             print(np)
+#             print(t)
+#             # print(' '.join(np.leaves()))
+#
+#             ns = filtered_np(subtree)
+#             print(ns)
+#
+#             print('---')
+
+
+
 for review in reviews:
     sentences = review[0]
     assert sentences.tag == 'sentences'
@@ -118,11 +169,11 @@ for review in reviews:
             replaced_text = replace_feature_nps(opinion[0], text, nps)
             if replaced_text:
                 prepped_opinions += 1
-                print('---')
-                print(text)
-                print(replaced_text)
-                print(opinion)
-                print('---')
+                # print('---')
+                # print(text)
+                # print(replaced_text)
+                # print(opinion)
+                # print('---')
             else:
                 pass
                 # print('---')
-- 
GitLab