Commit 74d4793b authored by Joel Oksanen

Cleaned up files in .gitremove

parent c516942e
This diff is collapsed.
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from nltk.tree import ParentedTree as Tree
import re
from xml.dom import minidom
import os
# Convert the aspect-term annotations found in `filepath` into a flat XML
# dataset: one <instance> per aspect term, holding the sentence text, the
# term's polarity and the CoreNLP constituency parse of the sentence with the
# aspect term masked as the token 'ARG'.
filepath = 'Laptops_Test_Gold.xml'
output = 'SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'

nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

root = Element('data')

opinion_labels = ['negative', 'neutral', 'positive', 'conflict']

# number of instances emitted per polarity label
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0,
    'conflict': 0,
}

input = parse(filepath)

try:
    for sentence in input.getroot():
        raw_text = sentence.find('text').text
        # replace all occurrences of two or more . with standardised ...
        text = re.sub('[.][.]+', '...', raw_text)

        # Compare against None explicitly: truth-testing an Element is
        # deprecated and is False for any element without children.
        aspect_terms = sentence.find('aspectTerms')
        if aspect_terms is None:
            continue

        for aspect_term in aspect_terms:
            arg_from = int(aspect_term.attrib['from'])
            arg_to = int(aspect_term.attrib['to'])
            opinion = aspect_term.attrib['polarity']

            # Mask the aspect term BEFORE normalising ellipses. Normalising
            # first changes the text length, so the offsets would cut in the
            # wrong place.
            # NOTE(review): assumes 'from'/'to' are character offsets into
            # the unmodified <text> -- confirm against the dataset.
            masked_text = raw_text[:arg_from] + 'ARG' + raw_text[arg_to:]
            masked_text = re.sub('[.][.]+', '...', masked_text)

            # get corenlp tree with argument in place
            parse_tree_str = nlp.parse(masked_text)
            # round-trip through nltk so the stored string uses nltk's layout
            parse_tree = Tree.fromstring(parse_tree_str)
            labelled_parse_tree_str = str(parse_tree)

            instance_node = SubElement(root, 'instance')
            text_node = SubElement(instance_node, 'text')
            text_node.text = text
            opinion_node = SubElement(instance_node, 'opinion')
            opinion_node.text = opinion
            opinion_tree_node = SubElement(instance_node, 'tree')
            opinion_tree_node.text = labelled_parse_tree_str

            prepared_counts[opinion] += 1
finally:
    # The wrapper starts a CoreNLP backend process; always shut it down.
    nlp.close()

xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
# Drop blank lines. Join with '\n' and let text mode translate it to the
# platform line ending; joining with os.linesep would double '\r' on Windows.
xmlstr = '\n'.join(s for s in xmlstr.splitlines() if s.strip())

# NOTE(review): the write below was reconstructed; the `with` body is absent
# in the visible source -- confirm.
with open(output, 'w', encoding='utf-8') as f:
    f.write(xmlstr)
import xml.etree.ElementTree as ET
from stanfordcorenlp import StanfordCoreNLP
import re
# Attach a CoreNLP constituency parse (as a <tree> child) to every sentence
# that carries an <Opinions> element.
# NOTE(review): the input path is empty in the source, so ET.parse('') cannot
# succeed as written -- supply the reviews XML path before running.
tree = ET.parse('')
reviews = tree.getroot()

nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

try:
    for review in reviews:
        sentences = review[0]
        assert sentences.tag == 'sentences'
        for sentence in sentences:
            opinions = sentence.find('Opinions')
            if opinions is None:
                # sentences without opinion annotations are left untouched
                continue

            text_node = sentence.find('text')
            # replace all occurrences of two or more . with standardised ...
            text_node.text = re.sub('[.][.]+', '...', text_node.text)

            parse_tree_str = nlp.parse(text_node.text)
            parse_tree = ET.Element('tree')
            parse_tree.text = parse_tree_str
            # NOTE(review): attaching the element is reconstructed; the
            # visible source builds <tree> but never uses it -- confirm.
            sentence.append(parse_tree)
finally:
    # The wrapper starts a CoreNLP backend process; always shut it down.
    nlp.close()

# TODO(review): nothing in the visible source writes `tree` back to disk;
# add `tree.write(<output path>)` once the intended destination is known.
from xml.etree.ElementTree import ElementTree, parse, Element, SubElement
from nltk.tokenize import word_tokenize
import string
from nltk.tree import ParentedTree as Tree
# Maps each feature name to the list of alternative terms treated as
# referring to that feature. An empty list means only the feature name
# itself is recognised.
glossary = {
    'laptop': ['computer', 'device', 'machine', 'price', 'cost', 'macbook', 'mac', 'pc', 'speed', 'it', 'this', 'product'],
    'display': ['monitor', 'screen', 'touchscreen'],
    'cpu': ['processor'],
    'motherboard': [],
    'hard disc': ['storage'],
    'memory': ['ram'],
    'battery': ['battery life'],
    'power supply': ['charger', 'power supply cord', 'power adapter'],
    'keyboard': ['keys', 'numpad'],
    'mouse': ['mouse pad', 'touchpad'],
    'fans cooling': ['fan', 'cooling', 'heat sink'],
    'optical drives': ['cd players', 'dvd drive', 'disc drive', 'dvd burner'],
    'ports': ['usb port', 'hdmi port', 'vga port', 'card reader', 'firewire port', 'sd card slot', 'dvi port', 'thunderbolt port'],
    'graphics': ['graphics card', 'video card', 'graphics chip', 'gpu'],
    'multimedia devices': ['sound', 'audio', 'microphone', 'camera', 'webcam', 'speakers', 'headphone'],
    'hardware': [],
    'os': ['os x', 'windows', 'linux', 'start menu', 'safe mode', 'boot manager', 'drag and drop feature'],
    'software': ['office', 'iwork', 'word processor', 'microsoft word', 'powerpoint', 'browser', 'skype', 'iphoto', 'ilife', 'pages', 'keynote', 'antivirus program', 'firewall', 'games', 'facial recognition'],
    'warranty': [],
    'shipping': ['delivery'],
    'support': ['service', 'customer service'],
    'company': ['apple', 'hp', 'asus', 'toshiba', 'dell', 'compaq', 'acer', 'lenovo']
}

# Node labels that get_np_tree always keeps when building a reduced NP.
included_labels = ['NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP', '\'\'', '.']
# Noun labels that are kept only when their words occur in the glossary.
noun_labels = ['NN', 'NNS']
def glossary_terms():
    """Return every single word used in the glossary.

    Feature names come first, followed by all alternative terms; multi-word
    entries are broken up on spaces, and duplicates are kept.
    """
    phrases = list(glossary.keys())
    for alternatives in glossary.values():
        phrases.extend(alternatives)
    joined = ' '.join(phrases)
    return joined.split(' ')
def included_noun(t):
    """Return True if `t` is a noun node whose words all occur in the glossary.

    `t` must carry one of the labels in `noun_labels`, and every leaf
    (lower-cased) must be one of the words returned by `glossary_terms()`.
    """
    if t.label() not in noun_labels:
        return False
    # Build the word set once per call instead of rebuilding the whole term
    # list for every leaf.
    known_words = set(glossary_terms())
    return all(leaf.lower() in known_words for leaf in t.leaves())
def get_np_tree(np):
    """Build a reduced copy of `np`, scanning its children right to left.

    Leaf tokens are always kept. A subtree is kept (recursively reduced) when
    its label is in `included_labels` or it satisfies `included_noun`; the
    first subtree that is neither ends the scan.

    Returns a pair `(tree, cont)`:
      * `tree` is a new Tree with the label of `np` and the kept children in
        their original order, or None when the scan ended before anything
        was kept;
      * `cont` is True only when every child of `np` was kept.

    NOTE(review): the two `else` branches were reconstructed from a source
    whose indentation was lost -- confirm against the original.
    """
    children = []
    for np_sub in reversed(np):
        if not isinstance(np_sub, Tree):
            # plain token
            children.insert(0, np_sub)
            continue

        if np_sub.label() in included_labels or included_noun(np_sub):
            subtree, cont = get_np_tree(np_sub)
            assert subtree is not None
            children.insert(0, subtree)
            if not cont:
                # the nested scan stopped early, so stop here as well
                return (Tree(np.label(), children), False)
        else:
            # first excluded subtree: everything further left is dropped
            return (Tree(np.label(), children) if children else None, False)

    return (Tree(np.label(), children), True)
def extract_extended_nouns(tree):
    """Return the reduced noun phrases found in `tree`.

    Every subtree labelled 'NP' is passed through `get_np_tree`; results that
    are None or empty are skipped.
    """
    phrases = []
    for subtree in tree.subtrees():
        if subtree.label() != 'NP':
            continue
        phrase, _ = get_np_tree(subtree)
        if phrase:
            # NOTE(review): this append is reconstructed; the `if` body is
            # absent in the visible source -- confirm.
            phrases.append(phrase)
    return phrases
def parse_opinion(opinion):
    """Return `(feature, polarity)` for an opinion element.

    The feature is the part of the 'category' attribute before the first
    '#', lower-cased, with underscores turned into spaces. Raises ValueError
    when the category contains no '#'.
    """
    attributes = opinion.attrib
    category = attributes['category']
    separator_at = category.index("#")
    entity = category[:separator_at].lower()
    polarity = attributes['polarity']
    feature = entity.replace('_', ' ')
    return (feature, polarity)
def get_glossary(feature):
    """Return `feature` followed by its glossary alternatives, if any."""
    alternatives = glossary.get(feature, [])
    return [feature] + alternatives
def node_value(node):
    """Return the label of a Tree node, or the node itself for a leaf."""
    if type(node) is Tree:
        return node.label()
    return node
# true if tree2 is a subset of tree1 (with same init node)
def tree_contains(tree1, tree2):
    """Check whether `tree2` can be embedded in `tree1` starting at the root.

    The two root values must be equal, and every child of `tree2` must be
    contained (recursively) in at least one child of `tree1`.
    """
    if node_value(tree1) != node_value(tree2):
        return False
    if type(tree2) is not Tree:
        # tree2 is a leaf: matching values are enough
        return True
    if type(tree1) is not Tree:
        return False
    for wanted in tree2:
        if not any(tree_contains(candidate, wanted) for candidate in tree1):
            return False
    return True
def delete_identical_branches(tree1, node1, node2):
    """Delete from `tree1` the branches under `node1` that equal branches of `node2`.

    `node1` must be a node inside `tree1`. When `node1` equals `node2` it is
    removed from `tree1`; otherwise, if both are trees, every child of
    `node1` is compared against every child of `node2`.
    """
    if node2 == node1:
        # Only subtrees have a tree position; two equal string leaves under
        # differently labelled parents are not an identical branch.
        if isinstance(node1, Tree):
            del tree1[node1.treeposition()]
    elif isinstance(node1, Tree) and isinstance(node2, Tree):
        for subnode2 in node2:
            # Iterate over a snapshot: the recursive call may delete children
            # of node1, and deleting during live iteration skips the sibling
            # that follows each deleted child.
            for subnode1 in list(node1):
                delete_identical_branches(tree1, subnode1, subnode2)
# tree2 in tree1 replaced by new leaf 'ARG'
def replace_np(tree1, tree2):
    """Replace occurrences of `tree2` inside `tree1` with the leaf 'ARG', in place.

    Nodes of `tree1` are visited breadth-first. Each tree node that contains
    `tree2` gets 'ARG' appended as its last child, and its branches that are
    identical to those of `tree2` are then deleted.
    """
    pending = [tree1]
    while pending:
        current = pending.pop(0)
        if type(current) is not Tree:
            continue
        # queue the children before the node is modified
        pending.extend(current)
        if tree_contains(current, tree2):
            current.append('ARG')
            delete_identical_branches(tree1, current, tree2)
def replace_feature_nps_tree(feature, parse_tree, np_trees):
    """Return a copy of `parse_tree` with the NPs that mention `feature` replaced by 'ARG'.

    An NP from `np_trees` matches when at least one of its tokens
    (lower-cased) is the feature name or one of its glossary alternatives.
    Every matching NP is replaced; overlapping matches are not filtered out.

    Returns None when no NP matches. `parse_tree` itself is not modified.

    NOTE(review): recording each matching NP once is reconstructed; the body
    of the match test is absent in the visible source -- confirm.
    """
    # The term list depends only on `feature`, so look it up once.
    feature_terms = get_glossary(feature)
    np_matches = [
        np_tree
        for np_tree in np_trees
        if any(token.lower() in feature_terms for token in np_tree.leaves())
    ]
    if not np_matches:
        return None

    modified_tree = parse_tree.copy(deep=True)
    for np_match in np_matches:
        replace_np(modified_tree, np_match)

    # sanity check: at least one replacement must have changed the copy
    assert parse_tree != modified_tree
    return modified_tree
# parse_tree_str = '''(ROOT
# (S
# (SBAR (RB Ever) (IN since)
# (S
# (NP (PRP I))
# (VP (VBD bought)
# (NP (DT this) (NN laptop)))))
# (, ,)
# (ADVP (RB so) (RB far))
# (NP (PRP I))
# (VP (VBP 've)
# (NP
# (NP (NN experience) (NN nothing))
# (CC but)
# (NP
# (NP (JJ constant) (NN break) (NNS downs))
# (PP (IN of)
# (NP
# (NP (DT the)
# (NP (NN laptop))
# (CC and)
# (NP (JJ bad) (NN customer) (NNS services)))
# (S
# (NP (PRP I))
# (VP (VBD received)
# (PP (IN over)
# (NP (DT the) (NN phone)))
# (PP (IN with)
# (NP (NN toshiba) (NN customer) (NNS services) (NNS hotlines)))))))))))
# (. .)))'''
# parse_tree = Tree.fromstring(parse_tree_str)
# nps = extract_extended_nouns(parse_tree)
# # for n in nps:
# # print(n)
# mod = replace_feature_nps_tree('laptop', parse_tree, nps)
# print(mod)
# Build <instance> elements (text, opinion polarity, modified parse tree) from
# the reviews in the parsed XML file: for each opinion of a sentence, the noun
# phrases mentioning the opinion's feature are replaced by 'ARG' in the
# sentence's parse tree.
tree = parse('ABSA16_Laptops_Test_with_parse_trees.xml')
reviews = tree.getroot()

n = len(reviews)
i = 0
prepared_opinions_count = 0
total_opinions_count = 0

# upper bound on the number of instances kept per polarity
MAX_INSTANCES_PER_POLARITY = 526

# number of instances kept per polarity so far
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0,
}

train_root = Element('data')

for review in reviews:
    sentences = review[0]
    assert sentences.tag == 'sentences'

    for sentence in sentences:
        opinions = sentence.find('Opinions')
        if opinions is None:
            # nothing to label in this sentence
            continue

        # de-duplicate (feature, polarity) pairs within the sentence
        opinions = {parse_opinion(opinion) for opinion in opinions}
        text = sentence.find('text').text
        parse_tree_str = sentence.find('tree').text
        parse_tree = Tree.fromstring(parse_tree_str)
        nps = extract_extended_nouns(parse_tree)

        opinion_trees = []
        # attempt to identify opinion target in sentence
        for feature, polarity in opinions:
            total_opinions_count += 1
            modified_tree = replace_feature_nps_tree(feature, parse_tree, nps)
            if modified_tree and prepared_counts[polarity] < MAX_INSTANCES_PER_POLARITY:
                prepared_opinions_count += 1
                prepared_counts[polarity] += 1
                # NOTE(review): the headings below are printed without the
                # values they announce in the visible source -- confirm
                # whether print(text) / print(opinion) lines were dropped.
                print('review text:')
                print('modified text:')
                print(' '.join(modified_tree.leaves()))
                print('labelled opinion:')

                # store in new train_tree
                instance_node = SubElement(train_root, 'instance')
                text_node = SubElement(instance_node, 'text')
                text_node.text = text
                opinion_node = SubElement(instance_node, 'opinion')
                opinion_node.text = polarity
                opinion_tree_node = SubElement(instance_node, 'tree')
                opinion_tree_node.text = str(modified_tree)

    # progress: reviews processed so far
    i += 1
    print('{}/{}'.format(i, n))

train_tree = ElementTree(train_root)
# TODO(review): nothing in the visible source writes `train_tree` to disk;
# add `train_tree.write(<output path>)` once the destination is known.

print('{}/{} opinions prepared'.format(prepared_opinions_count, total_opinions_count))
This diff is collapsed.