From 54d3f347875ac27336212d40ae7b500417caf606 Mon Sep 17 00:00:00 2001
From: Joel Oksanen <joel.oksanen17@imperial.ac.uk>
Date: Wed, 25 Mar 2020 20:22:41 +0200
Subject: [PATCH] Noun phrase parsing now returns tree instead of string. TODO:
 Use the tree in opinion target replacement

---
 .../SemEval-2016/prep_data.py                 | 73 ++++++++++++++++---
 1 file changed, 62 insertions(+), 11 deletions(-)

diff --git a/ADA/aspect-based SA data/SemEval-2016/prep_data.py b/ADA/aspect-based SA data/SemEval-2016/prep_data.py
index a7e93a4..0c8133c 100644
--- a/ADA/aspect-based SA data/SemEval-2016/prep_data.py	
+++ b/ADA/aspect-based SA data/SemEval-2016/prep_data.py	
@@ -4,7 +4,7 @@ import string
 
 from nltk.tree import Tree
 
-included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD']
+included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD', 'FW']
 
 # Marks all subtrees with descriptive label with 'DESC'
 def get_np_words(np):
@@ -19,6 +19,22 @@ def get_np_words(np):
             return [np_sub]
     return w
 
+def get_np_tree(np):
+    """Keep np's rightmost children up to the first excluded label; return (Tree or None, True if nothing was cut)."""
+    children = []
+    for np_sub in reversed(np):  # right-to-left, so collection stops at the first excluded subtree
+        if not isinstance(np_sub, Tree):  # isinstance also recognises Tree subclasses
+            children.insert(0, np_sub)  # leaf token: always kept
+            continue
+        if np_sub.label() not in included_labels:
+            return (Tree(np.label(), children) if children else None, False)
+        subtree, cont = get_np_tree(np_sub)
+        assert subtree is not None  # NOTE(review): assumes included-label subtrees never prune to nothing
+        children.insert(0, subtree)
+        if not cont:  # the child was truncated, so nothing further left may be attached
+            return (Tree(np.label(), children), False)
+    return (Tree(np.label(), children), True)
+
 def filtered_np(np):
     w = get_np_words(np)
     i = (len(w) - w[::-1].index('DESC')) if 'DESC' in w else 0
@@ -31,8 +47,10 @@ def extract_extended_nouns(tree_str):
     for tree in trees:
         for subtree in tree.subtrees():
             if subtree.label() == 'NP':
-                # np = ' '.join(np.leaves())
-                phrases.append(filtered_np(subtree))
+                np, _ = get_np_tree(subtree)
+                if np:
+                    np = ' '.join(np.leaves())
+                    phrases.append(np)
     return phrases
 
 tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
@@ -92,16 +110,49 @@ def replace_feature_nps(feature, text, nps):
     if len(detected_nps) == 0:
         return None
 
-    print(nps)
-    print(detected_nps)
     unique_nps = list(filter(lambda np: not any(other_np in np for other_np in detected_nps.difference({np})), detected_nps))
-    print(unique_nps)
 
     for unique_np in unique_nps:
+        if text == text.replace(unique_np, '$T$'):
+            print('***')
+            print(text)
+            print(nps)
+            print(detected_nps)
+            print(unique_nps)
         text = text.replace(unique_np, '$T$')
 
     return text
 
+# tree_str = '''(ROOT
+# (S
+# (S
+# (NP (PRP I))
+# (VP (MD would)
+# (VP (VB recommend)
+# (NP (PRP it)))))
+# (, ,)
+# (PP (IN for)
+# (NP
+# (NP (NN anybody))
+# (VP (VBG needing)
+# (NP (DT a) (JJ reliable) (JJ simple) (NN laptop)))))
+# (. .)))'''
+# trees = Tree.fromstring(tree_str)
+# for tree in trees:
+#     for subtree in tree.subtrees():
+#         if subtree.label() == 'NP':
+#             np, t = get_np_tree(subtree)
+#             print(np)
+#             print(t)
+#             # print(' '.join(np.leaves()))
+#
+#             ns = filtered_np(subtree)
+#             print(ns)
+#
+#             print('---')
+
+
+
 for review in reviews:
     sentences = review[0]
     assert sentences.tag == 'sentences'
@@ -118,11 +169,11 @@ for review in reviews:
             replaced_text = replace_feature_nps(opinion[0], text, nps)
             if replaced_text:
                 prepped_opinions += 1
-                print('---')
-                print(text)
-                print(replaced_text)
-                print(opinion)
-                print('---')
+                # print('---')
+                # print(text)
+                # print(replaced_text)
+                # print(opinion)
+                # print('---')
             else:
                 pass
                 # print('---')
-- 
GitLab