prep_data.py 2.12 KB
Newer Older
 Joel  Oksanen's avatar
Joel Oksanen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from nltk.tree import ParentedTree as Tree
import re
from xml.dom import minidom
import os

# Script: read aspect-term annotated sentences from `filepath`, and for every
# aspect term write an <instance> (sentence text, opinion polarity, and the
# CoreNLP parse of the sentence with the term replaced by the token ARG) to
# `output`.
filepath = 'Laptops_Test_Gold.xml'
output = 'SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'

root = Element('data')

opinion_labels = ['negative', 'neutral', 'positive', 'conflict']

# Number of instances written per polarity; printed at the end of the run.
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0,
    'conflict': 0
}

# Parse the input before starting CoreNLP so that a missing or malformed
# input file fails without leaving a backend server process behind.
source_tree = parse(filepath)

nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
try:
    for sentence in source_tree.getroot():
        raw_text = sentence.find('text').text
        # replace all occurrences of two or more . with standardised ...
        text = re.sub('[.][.]+', '...', raw_text)

        # Compare against None explicitly: the truth value of an Element
        # depends on its child count and testing it is deprecated.
        aspect_terms = sentence.find('aspectTerms')
        if aspect_terms is None:
            continue

        for aspect_term in aspect_terms:
            arg_from = int(aspect_term.attrib['from'])
            arg_to = int(aspect_term.attrib['to'])
            opinion = aspect_term.attrib['polarity']

            # The from/to attributes index the text as stored in the file,
            # so the term must be masked in the raw text; normalising the
            # dots first would shift every offset that follows them.
            masked_text = raw_text[:arg_from] + 'ARG' + raw_text[arg_to:]
            masked_text = re.sub('[.][.]+', '...', masked_text)

            # get corenlp tree with argument in place
            parse_tree_str = nlp.parse(masked_text)
            # Round-trip through the nltk tree class so the stored string
            # is nltk's own rendering of the parse.
            parse_tree = Tree.fromstring(parse_tree_str)
            labelled_parse_tree_str = str(parse_tree)

            instance_node = SubElement(root, 'instance')
            text_node = SubElement(instance_node, 'text')
            text_node.text = text
            opinion_node = SubElement(instance_node, 'opinion')
            opinion_node.text = opinion
            opinion_tree_node = SubElement(instance_node, 'tree')
            opinion_tree_node.text = labelled_parse_tree_str

            prepared_counts[opinion] += 1
finally:
    # Always shut the CoreNLP backend down, even if parsing failed midway.
    nlp.close()

xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent='   ')
# Drop whitespace-only lines from the pretty-printed output. Join with '\n':
# the file is opened in text mode, which already translates newlines to the
# platform convention, so joining with os.linesep would double the '\r' on
# Windows.
xmlstr = '\n'.join([s for s in xmlstr.splitlines() if s.strip()])
# The XML declaration emitted above names no encoding, so readers assume
# UTF-8; write the file accordingly instead of using the locale default.
with open(output, 'w', encoding='utf-8') as f:
    f.write(xmlstr)

print(prepared_counts)