Commit 7ee763a7 authored by  Joel  Oksanen's avatar Joel Oksanen
Browse files

Fixed and implemented tree implementation of argument replacement. 1564/2469 opinions prepared.

parent 54d3f347
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
(NP (NN battery)))) (NP (NN battery))))
(X (: ...))))</tree></sentence> (X (: ...))))</tree></sentence>
<sentence id="79:3"> <sentence id="79:3">
<text>super fast processor and really nice graphics card..</text> <text>super fast processor and really nice graphics card...</text>
<Opinions> <Opinions>
<Opinion category="CPU#OPERATION_PERFORMANCE" polarity="positive" /> <Opinion category="CPU#OPERATION_PERFORMANCE" polarity="positive" />
<Opinion category="GRAPHICS#GENERAL" polarity="positive" /> <Opinion category="GRAPHICS#GENERAL" polarity="positive" />
...@@ -43,9 +43,9 @@ ...@@ -43,9 +43,9 @@
(NP (NP
(ADJP (RB really) (JJ nice)) (ADJP (RB really) (JJ nice))
(NNS graphics) (NN card))) (NNS graphics) (NN card)))
(. .)))</tree></sentence> (: ...)))</tree></sentence>
<sentence id="79:4"> <sentence id="79:4">
<text>and plenty of storage with 250 gb(though I will upgrade this and the ram..)</text> <text>and plenty of storage with 250 gb(though I will upgrade this and the ram...)</text>
<Opinions> <Opinions>
<Opinion category="HARD_DISC#DESIGN_FEATURES" polarity="positive" /> <Opinion category="HARD_DISC#DESIGN_FEATURES" polarity="positive" />
</Opinions> </Opinions>
...@@ -59,16 +59,17 @@ ...@@ -59,16 +59,17 @@
(PP (IN with) (PP (IN with)
(NP (CD 250) (NN gb)))) (NP (CD 250) (NN gb))))
(-LRB- -LRB-) (-LRB- -LRB-)
(SBAR (IN though) (FRAG
(S (SBAR (IN though)
(NP (PRP I)) (S
(VP (MD will) (NP (PRP I))
(VP (VP (MD will)
(VP (VB upgrade) (VP
(NP (DT this))) (VP (VB upgrade)
(CC and) (NP (DT this)))
(NP (DT the) (NN ram)))))) (CC and)
(. .)))</tree></sentence> (NP (DT the) (NN ram))))))
(: ...) (-RRB- -RRB-))))</tree></sentence>
<sentence id="79:5"> <sentence id="79:5">
<text>This computer is really fast and I'm shocked as to how easy it is to get used to...</text> <text>This computer is really fast and I'm shocked as to how easy it is to get used to...</text>
<Opinions> <Opinions>
...@@ -27591,7 +27592,7 @@ ...@@ -27591,7 +27592,7 @@
(ADVP (RB twice))))))))))) (ADVP (RB twice)))))))))))
(: ...))))</tree></sentence> (: ...))))</tree></sentence>
<sentence id="53:1"> <sentence id="53:1">
<text>I bought it from HSN because it was "bundled" with extra software, but as it turns out, that software just crashes it more often.....</text> <text>I bought it from HSN because it was "bundled" with extra software, but as it turns out, that software just crashes it more often...</text>
<Opinions> <Opinions>
<Opinion category="SOFTWARE#GENERAL" polarity="negative" /> <Opinion category="SOFTWARE#GENERAL" polarity="negative" />
<Opinion category="LAPTOP#DESIGN_FEATURES" polarity="neutral" /> <Opinion category="LAPTOP#DESIGN_FEATURES" polarity="neutral" />
...@@ -29474,7 +29475,7 @@ ...@@ -29474,7 +29475,7 @@
(ADJP (JJ excellent)))))) (ADJP (JJ excellent))))))
(: ...)))</tree></sentence> (: ...)))</tree></sentence>
<sentence id="68:4"> <sentence id="68:4">
<text>10 hours of battery life is really something else....</text> <text>10 hours of battery life is really something else...</text>
<Opinions> <Opinions>
<Opinion category="BATTERY#OPERATION_PERFORMANCE" polarity="positive" /> <Opinion category="BATTERY#OPERATION_PERFORMANCE" polarity="positive" />
</Opinions> </Opinions>
...@@ -29685,7 +29686,7 @@ ...@@ -29685,7 +29686,7 @@
(NN depanable))) (NN depanable)))
(. .)))</tree></sentence> (. .)))</tree></sentence>
<sentence id="72:4"> <sentence id="72:4">
<text>As a graphic arts a retired instructor I still love to play with the graphic with photos and clip art .....</text> <text>As a graphic arts a retired instructor I still love to play with the graphic with photos and clip art ...</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#MISCELLANEOUS" polarity="positive" /> <Opinion category="LAPTOP#MISCELLANEOUS" polarity="positive" />
</Opinions> </Opinions>
...@@ -35358,18 +35359,22 @@ ...@@ -35358,18 +35359,22 @@
<Review rid="B00KMRGF28_365_A2PNKW6SVO157M"> <Review rid="B00KMRGF28_365_A2PNKW6SVO157M">
<sentences> <sentences>
<sentence id="B00KMRGF28_365_A2PNKW6SVO157M:0"> <sentence id="B00KMRGF28_365_A2PNKW6SVO157M:0">
<text>If your on a budget.... Save up longer.</text> <text>If your on a budget... Save up longer.</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#GENERAL" polarity="negative" /> <Opinion category="LAPTOP#GENERAL" polarity="negative" />
</Opinions> </Opinions>
<tree>(ROOT <tree>(ROOT
(S (FRAG
(SBAR (IN If) (SBAR (IN If)
(S (S
(VP (PRP$ your) (NP (PRP$ your)
(PP (IN on) (S
(NP (DT a) (NN budget)))))) (PP (IN on)
(: ...) (. .)))</tree></sentence> (NP (DT a) (NN budget)))
(: ...)
(VP (VB Save)
(ADVP (RB up) (RB longer)))))))
(. .)))</tree></sentence>
<sentence id="B00KMRGF28_365_A2PNKW6SVO157M:1"> <sentence id="B00KMRGF28_365_A2PNKW6SVO157M:1">
<text>This laptop isn't worth a single cent.</text> <text>This laptop isn't worth a single cent.</text>
<Opinions> <Opinions>
...@@ -36437,7 +36442,7 @@ ...@@ -36437,7 +36442,7 @@
(NP (NN cable) (VBG connecting))))))))) (NP (NN cable) (VBG connecting)))))))))
(. .)))</tree></sentence> (. .)))</tree></sentence>
<sentence id="B00KMRGF28_163_A2D8S1LRNDVO7T:2"> <sentence id="B00KMRGF28_163_A2D8S1LRNDVO7T:2">
<text>just wifi....</text> <text>just wifi...</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#CONNECTIVITY" polarity="negative" /> <Opinion category="LAPTOP#CONNECTIVITY" polarity="negative" />
</Opinions> </Opinions>
...@@ -37090,16 +37095,26 @@ ...@@ -37090,16 +37095,26 @@
(NP (NN MS) (NNP Office))))))))))))))) (NP (NN MS) (NNP Office)))))))))))))))
(. .)))</tree></sentence> (. .)))</tree></sentence>
<sentence id="B00KMRGF28_213_A1HYE6WQREOY5C:5"> <sentence id="B00KMRGF28_213_A1HYE6WQREOY5C:5">
<text>the only drawback is no DVD player.. but for $20 you can buy an external one</text> <text>the only drawback is no DVD player... but for $20 you can buy an external one</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#DESIGN_FEATURES" polarity="neutral" /> <Opinion category="LAPTOP#DESIGN_FEATURES" polarity="neutral" />
</Opinions> </Opinions>
<tree>(ROOT <tree>(ROOT
(S (NP
(NP (DT the) (JJ only) (NN drawback)) (NP (DT the) (JJ only) (NN drawback))
(VP (VBZ is) (SBAR
(NP (DT no) (NN DVD) (NN player))) (S
(. .)))</tree></sentence> (VP (VBZ is)
(NP (DT no) (NN DVD) (NN player)))
(: ...)
(CC but)
(S
(PP (IN for)
(NP ($ $) (CD 20)
(NP (PRP you))))
(VP (MD can)
(VP (VB buy)
(NP (DT an) (JJ external) (CD one)))))))))</tree></sentence>
</sentences> </sentences>
</Review> </Review>
<Review rid="B00AH4A950_3_A20TCCBI6TXGYX"> <Review rid="B00AH4A950_3_A20TCCBI6TXGYX">
...@@ -38771,14 +38786,14 @@ ...@@ -38771,14 +38786,14 @@
<Review rid="B00KB3MXH4_10_A32154TIUMUEBZ"> <Review rid="B00KB3MXH4_10_A32154TIUMUEBZ">
<sentences> <sentences>
<sentence id="B00KB3MXH4_10_A32154TIUMUEBZ:0"> <sentence id="B00KB3MXH4_10_A32154TIUMUEBZ:0">
<text>Great labtop..</text> <text>Great labtop...</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#GENERAL" polarity="positive" /> <Opinion category="LAPTOP#GENERAL" polarity="positive" />
</Opinions> </Opinions>
<tree>(ROOT <tree>(ROOT
(NP (NP
(NP (JJ Great) (NN labtop)) (NP (JJ Great) (NN labtop))
(. .)))</tree></sentence> (: ...)))</tree></sentence>
<sentence id="B00KB3MXH4_10_A32154TIUMUEBZ:1"> <sentence id="B00KB3MXH4_10_A32154TIUMUEBZ:1">
<text>not as fast as I would have liked it to be but everything else is great!</text> <text>not as fast as I would have liked it to be but everything else is great!</text>
<Opinions> <Opinions>
...@@ -39143,7 +39158,7 @@ ...@@ -39143,7 +39158,7 @@
<Review rid="B00KMRGF28_47_A1K5HRS73WO4T6"> <Review rid="B00KMRGF28_47_A1K5HRS73WO4T6">
<sentences> <sentences>
<sentence id="B00KMRGF28_47_A1K5HRS73WO4T6:0"> <sentence id="B00KMRGF28_47_A1K5HRS73WO4T6:0">
<text>very poor quality lap top....</text> <text>very poor quality lap top...</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#QUALITY" polarity="negative" /> <Opinion category="LAPTOP#QUALITY" polarity="negative" />
</Opinions> </Opinions>
...@@ -47226,7 +47241,7 @@ ...@@ -47226,7 +47241,7 @@
<text>The reviews at the time in late July 2014 were mostly positive and it was the best selling laptop with almost 4 star review...</text> <text>The reviews at the time in late July 2014 were mostly positive and it was the best selling laptop with almost 4 star review...</text>
</sentence> </sentence>
<sentence id="B00KMRGF28_193_A20GPA6RDO8DT7:3"> <sentence id="B00KMRGF28_193_A20GPA6RDO8DT7:3">
<text>The first thing I can say for positive is that the laptop is very light- makes it really portable..this with a relatively clear screen is a bonus.</text> <text>The first thing I can say for positive is that the laptop is very light- makes it really portable...this with a relatively clear screen is a bonus.</text>
<Opinions> <Opinions>
<Opinion category="LAPTOP#DESIGN_FEATURES" polarity="positive" /> <Opinion category="LAPTOP#DESIGN_FEATURES" polarity="positive" />
<Opinion category="LAPTOP#PORTABILITY" polarity="positive" /> <Opinion category="LAPTOP#PORTABILITY" polarity="positive" />
...@@ -47256,6 +47271,16 @@ ...@@ -47256,6 +47271,16 @@
(S (S
(NP (PRP it)) (NP (PRP it))
(ADJP (RB really) (JJ portable))))) (ADJP (RB really) (JJ portable)))))
(: ...)
(S
(NP
(NP (DT this))
(PP (IN with)
(NP (DT a)
(ADJP (RB relatively) (JJ clear))
(NN screen))))
(VP (VBZ is)
(NP (DT a) (NN bonus))))
(. .)))</tree></sentence> (. .)))</tree></sentence>
<sentence id="B00KMRGF28_193_A20GPA6RDO8DT7:4"> <sentence id="B00KMRGF28_193_A20GPA6RDO8DT7:4">
<text>Now the graphics are not HD...and the reason that it is light is that it is really basic (medium low grade) plastic.</text> <text>Now the graphics are not HD...and the reason that it is light is that it is really basic (medium low grade) plastic.</text>
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from stanfordcorenlp import StanfordCoreNLP from stanfordcorenlp import StanfordCoreNLP
import re
tree = ET.parse('ABSA16_Laptops_Train_SB1_v2.xml') tree = ET.parse('ABSA16_Laptops_Train_SB1_v2.xml')
reviews = tree.getroot() reviews = tree.getroot()
...@@ -13,8 +14,10 @@ for review in reviews: ...@@ -13,8 +14,10 @@ for review in reviews:
opinions = sentence.find('Opinions') opinions = sentence.find('Opinions')
if opinions is None: if opinions is None:
continue continue
text = sentence.find('text').text text_node = sentence.find('text')
parse_tree_str = nlp.parse(text) # replace all occurrences of two or more . with standardised ...
text_node.text = re.sub('[.][.]+', '...', text_node.text)
parse_tree_str = nlp.parse(text_node.text)
parse_tree = ET.Element('tree') parse_tree = ET.Element('tree')
parse_tree.text = parse_tree_str parse_tree.text = parse_tree_str
sentence.append(parse_tree) sentence.append(parse_tree)
......
...@@ -2,22 +2,31 @@ import xml.etree.ElementTree as ET ...@@ -2,22 +2,31 @@ import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
import string import string
from nltk.tree import Tree from nltk.tree import ParentedTree as Tree
included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD', 'FW'] tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
# Marks all subtrees with descriptive label with 'DESC' glossary = {
def get_np_words(np): 'laptop': ['computer', 'device', 'price', 'cost', 'macbook', 'mac', 'pc', 'speed', 'it', 'product'],
w = [] 'display': ['monitor', 'screen', 'touchscreen'],
for np_sub in np: 'cpu': ['processor'],
if type(np_sub) is Tree: 'hard disc': ['storage'],
if np_sub.label() not in included_labels: 'memory': ['ram'],
w += ['DESC'] 'power supply': ['charger', 'power supply cord', 'power adapter'],
else: 'keyboard': ['keys', 'numpad'],
w += get_np_words(np_sub) 'mouse': ['mouse pad', 'touchpad'],
else: 'fans cooling': ['fan', 'cooling', 'heat sink'],
return [np_sub] 'optical drives': ['cd players', 'dvd drive', 'disc drive', 'dvd burner'],
return w 'ports': ['usb port', 'hdmi port', 'vga port', 'card reader', 'firewire port', 'sd card slot', 'dvi port', 'thunderbolt port'],
'graphics': ['graphics card', 'video card', 'graphics chip', 'gpu'],
'multimedia devices': ['sound', 'audio', 'microphone', 'camera', 'webcam', 'speakers', 'headphone'],
'os': ['os x', 'windows', 'linux', 'start menu', 'safe mode', 'boot manager', 'drag and drop feature'],
'software': ['office', 'iwork', 'word processor', 'microsoft word', 'powerpoint', 'browser', 'skype', 'iphoto', 'ilife', 'pages', 'keynote', 'antivirus program', 'firewall', 'games', 'facial recognition'],
'company': ['apple', 'hp', 'asus', 'toshiba', 'dell', 'compaq', 'acer', 'lenovo']
}
included_labels = ['NN', 'NNS', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP']
def get_np_tree(np): def get_np_tree(np):
children = [] children = []
...@@ -35,51 +44,15 @@ def get_np_tree(np): ...@@ -35,51 +44,15 @@ def get_np_tree(np):
children = [np_sub] + children children = [np_sub] + children
return (Tree(np.label(), children), True) return (Tree(np.label(), children), True)
def extract_extended_nouns(tree):
    """Collect the reduced noun phrases found in a parse tree.

    Every subtree labelled 'NP' is handed to get_np_tree(); the first element
    of the pair it returns is kept whenever that element is truthy.

    :param tree: parse tree exposing subtrees()
    :return: list of the kept get_np_tree() results, in subtrees() order
    """
    candidates = (
        get_np_tree(subtree)[0]
        for subtree in tree.subtrees()
        if subtree.label() == 'NP'
    )
    return [candidate for candidate in candidates if candidate]
tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
glossary = {
'laptop': ['computer', 'price', 'cost', 'macbook'],
'display': ['monitor', 'screen'],
'cpu': ['processor'],
'hard disc': ['storage'],
'memory': ['ram'],
'power supply': ['charger', 'power supply cord', 'power adapter'],
'keyboard': ['keys', 'numpad'],
'mouse': ['mouse pad'],
'fans cooling': ['fan', 'cooling', 'heat sink'],
'optical drives': ['cd players', 'dvd drive', 'disc drive', 'dvd burner'],
'ports': ['usb port', 'hdmi port', 'vga port', 'card reader', 'firewire port', 'sd card slot', 'dvi port', 'thunderbolt port'],
'graphics': ['graphics card', 'video card', 'graphics chip', 'gpu'],
'multimedia devices': ['sound', 'audio', 'microphone', 'camera', 'webcam', 'speakers', 'headphone'],
'os': ['os x', 'windows', 'linux', 'start menu', 'safe mode', 'boot manager', 'drag and drop feature'],
'software': ['office', 'iwork', 'word processor', 'microsoft word', 'powerpoint', 'browser', 'skype', 'iphoto', 'ilife', 'pages', 'keynote', 'antivirus program', 'firewall', 'games', 'facial recognition']
}
n = len(reviews)
i = 0
prepped_opinions = 0
total_opinions = 0
def parse_opinion(opinion): def parse_opinion(opinion):
category = opinion.attrib['category'] category = opinion.attrib['category']
feature = category[:category.index("#")].lower() feature = category[:category.index("#")].lower()
...@@ -89,69 +62,59 @@ def parse_opinion(opinion): ...@@ -89,69 +62,59 @@ def parse_opinion(opinion):
def get_glossary(feature):
    """Return the feature name followed by its glossary entries.

    A feature without an entry in the module-level ``glossary`` dict yields a
    single-element list holding only the feature itself.
    """
    synonyms = glossary.get(feature, [])
    return [feature] + synonyms
def node_value(node):
    """Return the label of a Tree node, or the node itself for anything else (a leaf)."""
    if type(node) is not Tree:
        return node
    return node.label()
# for i in range(len(tokens)):
# token = tokens[i] # true if tree2 is a subset of tree1 (with same init node)
def tree_contains(tree1, tree2):
    """Return True if tree2 is a subset of tree1, compared from their root nodes.

    Two leaves match when they are equal. Two trees match when their labels
    are equal and every child of tree2 is contained in at least one child of
    tree1. A leaf never matches a tree: previously a leaf in tree2 whose text
    happened to equal the label of a subtree in tree1 was reported as
    contained.

    :param tree1: Tree or leaf searched in
    :param tree2: Tree or leaf searched for
    :return: bool
    """
    tree1_is_tree = isinstance(tree1, Tree)
    if not isinstance(tree2, Tree):
        # tree2 is a leaf: only an identical leaf can contain it.
        return not tree1_is_tree and tree1 == tree2
    if not tree1_is_tree or tree1.label() != tree2.label():
        return False
    return all(
        any(tree_contains(subtree1, subtree2) for subtree1 in tree1)
        for subtree2 in tree2
    )
np_tokens = [word_tokenize(np) for np in nps]
def delete_identical_branches(tree1, node1, node2):
    """Delete from tree1 the branches under node1 that are identical to branches of node2.

    If node1 is a subtree equal to node2 it is removed from tree1 via its tree
    position; otherwise, when both are trees, every child of node1 is compared
    against every child of node2 recursively.

    :param tree1: the tree node1 belongs to; node1.treeposition() is used as
        an index into it, so it is expected to be node1's root
    :param node1: node (Tree or leaf) inside tree1
    :param node2: node (Tree or leaf) of the tree being removed
    """
    node1_is_tree = isinstance(node1, Tree)
    # Only subtrees can be deleted: a bare leaf string has no treeposition(),
    # so equal leaves under differently labelled parents are left in place
    # instead of raising AttributeError.
    if node1_is_tree and node2 == node1:
        del tree1[node1.treeposition()]
    elif node1_is_tree and isinstance(node2, Tree):
        for subnode2 in node2:
            # Iterate over a snapshot: deleting a child while iterating node1
            # itself would make the loop skip the following sibling.
            for subnode1 in list(node1):
                delete_identical_branches(tree1, subnode1, subnode2)
def replace_np(tree1, tree2):
    """Replace the first occurrence of tree2 inside tree1 by a new leaf 'ARG'.

    tree1 is searched breadth-first. At the first Tree node that contains
    tree2 (per tree_contains), 'ARG' is appended as the node's last child and
    the branches matching tree2 are removed with delete_identical_branches.
    tree1 is modified in place; nothing is returned.
    """
    pending = [tree1]
    cursor = 0
    while cursor < len(pending):
        current = pending[cursor]
        cursor += 1
        if type(current) is not Tree:
            continue
        # Queue the children before the node is modified below.
        pending.extend(current)
        if not tree_contains(current, tree2):
            continue
        current.insert(len(current), 'ARG')
        delete_identical_branches(tree1, current, tree2)
        return
def replace_feature_nps_tree(feature, parse_tree, np_trees):
    """Return a copy of parse_tree with the noun phrases naming `feature` replaced by 'ARG'.

    A noun phrase matches when any of its leaves, lower-cased, is one of the
    terms returned by get_glossary(feature). Matching phrases that contain
    another (different) matching phrase are dropped, and each remaining phrase
    is replaced in a deep copy of parse_tree via replace_np.

    :param feature: feature name looked up with get_glossary
    :param parse_tree: parse tree of the sentence; it is not modified
    :param np_trees: candidate noun-phrase trees of the sentence
    :return: the modified deep copy, or None when no noun phrase matches
    :raises AssertionError: if the copy ends up equal to parse_tree
    """
    # Look the glossary up once instead of once per token.
    terms = set(get_glossary(feature))

    # Record each noun phrase at most once, even when several of its tokens
    # are glossary terms; otherwise the same phrase would be replaced
    # repeatedly further down.
    np_matches = [
        np_tree for np_tree in np_trees
        if any(token.lower() in terms for token in np_tree.leaves())
    ]
    if not np_matches:
        return None

    # Keep only the phrases that do not contain another matching phrase.
    unique_nps = [
        np for np in np_matches
        if not any(
            tree_contains(np, other_np)
            for other_np in np_matches
            if other_np != np
        )
    ]

    modified_tree = parse_tree.copy(deep=True)
    for unique_np in unique_nps:
        replace_np(modified_tree, unique_np)

    # Sanity check: a match was found, so the copy must differ from the input.
    assert parse_tree != modified_tree
    return modified_tree
n = len(reviews)
i = 0
prepped_opinions = 0
total_opinions = 0
for review in reviews: for review in reviews:
sentences = review[0] sentences = review[0]
...@@ -162,24 +125,25 @@ for review in reviews: ...@@ -162,24 +125,25 @@ for review in reviews:
continue continue
opinions = set([parse_opinion(opinion) for opinion in opinions]) opinions = set([parse_opinion(opinion) for opinion in opinions])
text = sentence.find('text').text text = sentence.find('text').text
parse_tree = sentence.find('tree').text parse_tree_str = sentence.find('tree').text
parse_tree = Tree.fromstring(parse_tree_str)
nps = extract_extended_nouns(parse_tree) nps = extract_extended_nouns(parse_tree)
for opinion in opinions: for opinion in opinions:
total_opinions += 1 total_opinions += 1
replaced_text = replace_feature_nps(opinion[0], text, nps) modified_tree = replace_feature_nps_tree(opinion[0], parse_tree, nps)
if replaced_text: if modified_tree:
prepped_opinions += 1 prepped_opinions += 1
# print('---') # print('---')
# print(text) # print(text)
# print(replaced_text) # print(' '.join(modified_tree.leaves()))
# print(opinion) # print(opinion)
# print('---') # print('---')
else: else:
pass pass
# print('---') print('---')
# print(text) print(text)
# print(nps) print(nps)
# print(opinion) print(opinion)
i += 1 i += 1
print('{}/{}'.format(i, n)) print('{}/{}'.format(i, n))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment