review_annotation.py 12.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
import pandas as pd
import math
from nltk.tokenize import TweetTokenizer
import os
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from xml.dom import minidom
import nltk.data
from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import ParentedTree as Tree
import re
11
12
13
import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
14

 Joel  Oksanen's avatar
Joel Oksanen committed
15
16
# Raw Amazon review dump (tab-separated) read by prepare_reviews().
data_location = 'amazon_data/amazon_reviews_us_PC_v1_00.tsv'
# XML file with the sampled reviews: written by prepare_reviews(), updated in
# place by annotate_reviews() and read by prepare_annotated_reviews().
selected_reviews_location = 'pc_reviews_to_be_annotated.xml'

# NOTE(review): min_characters / max_characters are not referenced by any code
# in this file -- presumably a review-length filter; confirm before relying on them.
min_characters = 0
max_characters = 200
# Number of reviews kept by prepare_reviews() after shuffling.
n = 500

# Key typed at the sentiment prompt in annotate_reviews() -> stored label.
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
# Background colour used to highlight an annotated span, keyed by label.
ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'conflict': bg.yellow}

# NOTE(review): not referenced by any code in this file.
annotated_reviews_location = 'annotated_camera_reviews.xml'

# Node labels accepted inside an opinion target (see is_opinion_target()).
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
# NOTE(review): not referenced by any code in this file.
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']

# Output file written by prepare_annotated_reviews().
prepared_reviews_location = 'annotated_amazon_laptop_reviews.xml'

# NOTE(review): `tokenizer` is not referenced by any code in this file.
tokenizer = TweetTokenizer()
# Sentence splitter used by prepare_reviews().
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

30

31
32
33
34
35
36
37
38
39
40
class bcolors:
    """ANSI escape sequences used to style text printed to the terminal."""

    # reset all styling
    ENDC = '\033[0m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # foreground colours (SGR codes 91-95)
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

41

42
43
44
45
46
47
48
49
50
51
def get_leaf_indices(tree, phrase_tree):
    """Return the leaf-index span of ``phrase_tree`` within ``tree``.

    A leaf belongs to the phrase when its tree position starts with the
    phrase's own tree position.

    :param tree: tree offering ``leaves()`` and ``leaf_treeposition(i)``.
    :param phrase_tree: subtree offering ``treeposition()``.
    :return: ``(start, end)`` -- indices of the first and last leaf covered by
        the phrase, both inclusive.

    If no leaf lies under the phrase position, the scan runs past the last
    leaf and whatever ``tree.leaf_treeposition`` raises for an out-of-range
    index propagates to the caller.
    """
    prefix = phrase_tree.treeposition()
    depth = len(prefix)
    # leaves() builds a fresh list on every call, so take its length once
    # instead of once per loop iteration.
    leaf_count = len(tree.leaves())

    # first leaf under the phrase
    start = 0
    while tree.leaf_treeposition(start)[:depth] != prefix:
        start += 1

    # extend to the last consecutive leaf under the phrase
    end = start
    while end + 1 < leaf_count and tree.leaf_treeposition(end + 1)[:depth] == prefix:
        end += 1
    return (start, end)

52

53
54
# true if r1 contains r2
def range_contains(r1, r2):
    """Return True when range ``r1`` contains range ``r2``.

    Both arguments are indexable as ``(lower, upper, tree_str)``. ``r1``
    contains ``r2`` when its bounds enclose those of ``r2`` and the tree parsed
    from ``r2[2]`` equals one of the subtrees of the tree parsed from ``r1[2]``.
    """
    # cheap bounds check first; the trees are only parsed when it passes
    if r1[0] > r2[0] or r1[1] < r2[1]:
        return False
    inner_tree = Tree.fromstring(r2[2])
    return inner_tree in Tree.fromstring(r1[2]).subtrees()
56

57

58
59
60
def in_range(r, n):
    """Return True when ``n`` lies between ``r[0]`` and ``r[1]``, both inclusive."""
    return r[0] <= n <= r[1]

61

62
63
# true if rs cover r
def range_cover(r, rs):
    """Return True when every index of ``r`` falls inside at least one range of ``rs``.

    :param r: range indexable as ``(lower, upper, ...)``; both bounds inclusive.
    :param rs: iterable of ranges with the same layout. It is traversed once
        per index of ``r``, so it must be re-iterable (e.g. a list).
    """
    return all(
        any(in_range(candidate, position) for candidate in rs)
        for position in range(r[0], r[1] + 1)
    )

69

70
71
72
73
74
def is_opinion_target(tree):
    """Return True when ``tree`` qualifies as an opinion target.

    The tree's own label must be in ``included_labels``, and every subtree
    (as yielded by ``tree.subtrees()``) must either carry an included label or
    be a ``PRP`` node whose first child is the word "it" (case-insensitive).
    """
    if tree.label() not in included_labels:
        return False

    def allowed(node):
        if node.label() in included_labels:
            return True
        return node.label() == 'PRP' and node[0].lower() == 'it'

    return all(allowed(node) for node in tree.subtrees())
75

76

77
78
79
80
81
82
def _normalise_review_text(text):
    """Clean the HTML residue and dot runs found in a raw review body."""
    text = text.replace('<br />', '\n')
    # collapse any run of two or more dots into a single ellipsis
    text = re.sub('[.][.]+', '...', text)
    text = text.replace('&#34;', '"')
    # every other numeric character reference is replaced by a space
    text = re.sub('[&][#][0-9]+[;]', ' ', text)
    return text


def prepare_reviews(corenlp_path=r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05'):
    """Sample laptop reviews, parse them with CoreNLP and save them as XML.

    Reads ``data_location``, keeps reviews whose product title mentions
    "laptop" but none of the accessory keywords, shuffles them and keeps the
    first ``n``. Each review is split into sentences; every sentence is stored
    with its text, its constituency parse and its tokenised text. The result is
    written to ``selected_reviews_location`` with every review marked
    ``annotated="false"``.

    :param corenlp_path: directory of the Stanford CoreNLP distribution that is
        passed to ``StanfordCoreNLP``.
    """
    try:
        reviews = pd.read_csv(data_location, sep='\t', on_bad_lines='skip')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the older spelling
        reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)

    # drop reviews with empty review body
    reviews = reviews[~reviews['review_body'].isnull()]

    # laptop reviews
    reviews = reviews[reviews['product_title'].str.contains('laptop', case=False, na=False)]

    # try to filter out reviews for accessories
    filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
                    'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
    # matching is case-insensitive (case=False), so a plain alternation suffices
    filter_pat = '|'.join(filter_words)
    reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True, case=False, na=False)]

    # shuffle reviews
    reviews = reviews.sample(frac=1).reset_index(drop=True)

    # pick first n reviews
    reviews = reviews.head(n)

    root = Element('reviews')

    nlp = StanfordCoreNLP(corenlp_path)
    try:
        for _, review in reviews.iterrows():
            review_node = SubElement(root, 'review')
            review_node.set('annotated', 'false')

            id_node = SubElement(review_node, 'review_id')
            id_node.text = review['review_id']
            title_node = SubElement(review_node, 'product_title')
            title_node.text = review['product_title']
            text_node = SubElement(review_node, 'review_body')
            text = _normalise_review_text(review['review_body'])
            text_node.text = text

            sentences_node = SubElement(review_node, 'sentences')
            for sentence in sent_tokenizer.tokenize(text):
                sentence_node = SubElement(sentences_node, 'sentence')
                sentence_text_node = SubElement(sentence_node, 'text')
                sentence_text_node.text = sentence

                parse_tree_str = nlp.parse(sentence)
                parse_tree = Tree.fromstring(parse_tree_str)

                parse_tree_node = SubElement(sentence_node, 'parse_tree')
                parse_tree_node.text = parse_tree_str

                tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
                tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``', '""')
    finally:
        # shut the CoreNLP backend down even if parsing fails part-way
        nlp.close()

    # save tree to file; the XML declaration names no encoding, so the file
    # has to be UTF-8 for it to be read back correctly
    xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent='   ')
    xmlstr = '\n'.join(line for line in xmlstr.splitlines() if line.strip())
    with open(selected_reviews_location, 'w', encoding='utf-8') as f:
        f.write(xmlstr)

    print('Obtained and parsed', len(reviews), 'reviews')

148

149
150
def _render_sentence_rows(text, cursor_pos, start, annotations, row_width):
    """Return the highlighted sentence split into rows of ``row_width`` characters."""
    rows = []
    row = ''
    for t, char in enumerate(text):
        # pending selection (start..cursor) and the cursor share one highlight
        in_selection = start is not None and start <= t <= cursor_pos
        if in_selection or t == cursor_pos:
            char = bg.li_black + char + bg.rs
        # stored annotations use an exclusive end index
        for (ann_start, ann_end), sentiment in annotations:
            if ann_start <= t < ann_end:
                char = ann_bgs[sentiment] + char + bg.rs
        row += char
        if (t + 1) % row_width == 0:
            rows.append(row)
            row = ''
    # the trailing (possibly empty) row is always emitted
    rows.append(row)
    return rows


def annotate_reviews():
    """Interactively annotate the not-yet-annotated reviews sentence by sentence.

    Reviews are loaded from ``selected_reviews_location``. For every sentence
    the text is shown with a cursor:

    * arrow keys move the cursor (up/down jump one display row);
    * space marks the start of a span, a second space at or after the start
      closes it and prompts for a sentiment key from ``sentiment_mappings``;
    * ``u`` removes the most recent annotation of the current sentence;
    * ``n`` stores the sentence's annotations and moves to the next sentence;
    * ``s`` drops the whole review from the file;
    * ``q`` quits without saving the review in progress.

    After each finished or skipped review the tree is written back to
    ``selected_reviews_location``.
    """
    row_character_count = 100

    reviews = parse(selected_reviews_location)
    root = reviews.getroot()

    # filter out reviews that have been annotated already
    not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
    n_annotated = len(root) - len(not_annotated)

    for review in not_annotated:
        # reset per review so a review without sentences never reuses the key
        # pressed for the previous review
        task = None
        product_title = review.find('product_title').text

        for sentence in review.find('sentences'):
            text = sentence.find('text').text
            cursor_pos = 0
            start = None
            annotations = []

            while True:
                os.system('clear')

                print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
                print('')

                print(bcolors.OKBLUE + 'next:       \'n\'' + bcolors.ENDC)
                print(bcolors.OKBLUE + 'skip:       \'s\'' + bcolors.ENDC)
                print(bcolors.OKBLUE + 'undo:       \'u\'' + bcolors.ENDC)
                print(bcolors.OKBLUE + 'quit:       \'q\'' + bcolors.ENDC)
                print('')

                print(bcolors.OKGREEN + product_title + bcolors.ENDC)
                print('')

                for row in _render_sentence_rows(text, cursor_pos, start, annotations,
                                                 row_character_count):
                    print(row)
                print('')

                task = readchar.readkey()

                if task == readchar.key.RIGHT:
                    cursor_pos = min(cursor_pos + 1, len(text) - 1)
                elif task == readchar.key.LEFT:
                    cursor_pos = max(cursor_pos - 1, 0)
                elif task == readchar.key.DOWN:
                    cursor_pos = min(cursor_pos + row_character_count, len(text) - 1)
                elif task == readchar.key.UP:
                    cursor_pos = max(cursor_pos - row_character_count, 0)
                elif task == readchar.key.SPACE:
                    if start is None:
                        start = cursor_pos
                    elif cursor_pos >= start:
                        # stored end index is exclusive; the prompt shows the inclusive one
                        end = cursor_pos + 1
                        while True:
                            inp = input('Sentiment for {},{}: '.format(start, end - 1))
                            if inp in sentiment_mappings:
                                annotations.append(((start, end), sentiment_mappings[inp]))
                                start = None
                                cursor_pos = min(cursor_pos + 1, len(text) - 1)
                                break
                elif task == 'u' and annotations:
                    del annotations[-1]
                elif task in ('n', 's', 'q'):
                    if task == 'n' and annotations:
                        # save annotations to tree
                        annotations_node = SubElement(sentence, 'annotations')
                        for annotation in annotations:
                            annotation_node = SubElement(annotations_node, 'annotation')
                            range_node = SubElement(annotation_node, 'range')
                            range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
                            sent_node = SubElement(annotation_node, 'sentiment')
                            sent_node.text = annotation[1]
                    break

            # quitting or skipping abandons the remaining sentences of this review
            if task in ('q', 's'):
                break

        if task == 'q':
            os.system('clear')
            break
        elif task == 's':
            root.remove(review)
        elif task == 'n':
            n_annotated += 1
            review.set('annotated', 'true')

        # save tree to file; the XML declaration names no encoding, so the
        # file has to be UTF-8 for it to be read back correctly
        xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent='   ')
        xmlstr = '\n'.join(line for line in xmlstr.splitlines() if line.strip())
        with open(selected_reviews_location, 'w', encoding='utf-8') as f:
            f.write(xmlstr)

264

265
266
267
268
def longest_common_subsequence(x, y):
    """Return the longest common *prefix* of ``x`` and ``y`` as a tuple.

    Despite the name this is not a general longest-common-subsequence: the
    comparison stops at the first index where the two sequences differ (or
    where the shorter one ends).
    """
    prefix = []
    for left, right in zip(x, y):
        if left != right:
            break
        prefix.append(left)
    return tuple(prefix)

274

275
276
277
278
279
280
281
def labelled_tree_str(tree_str, start, end):
    """Return ``tree_str`` with the span of leaves ``start..end`` replaced by ``ARG``.

    :param tree_str: bracketed parse tree, parsed with ``Tree.fromstring``.
    :param start: index of the first leaf of the span.
    :param end: index of the last leaf of the span (inclusive).
    :return: string form of the tree in which the branches covering the span
        are replaced by a single ``ARG`` leaf.
    """
    tree = Tree.fromstring(tree_str)
    start_pos = tree.leaf_treeposition(start)
    end_pos = tree.leaf_treeposition(end)

    # find highest parent node common to start and end
    if start == end:
        parent_pos = start_pos[:-1]
    else:
        parent_pos = longest_common_subsequence(start_pos, end_pos)
    parent_node = tree[parent_pos]
    # climb through single-child ancestors; stop at the root, whose parent()
    # is None (e.g. when the span covers the whole sentence)
    while parent_node.parent() is not None and len(parent_node.parent()) == 1:
        parent_node = parent_node.parent()
        parent_pos = parent_pos[:-1]

    # remove branches between start and end inclusive; delete by index, from
    # the right, so that an equal-looking earlier sibling is never removed
    # instead and the remaining indices stay valid
    child_index_rng = range(start_pos[len(parent_pos)], end_pos[len(parent_pos)] + 1)
    for child_index in reversed(child_index_rng):
        del parent_node[child_index]

    # insert ARG in place of removed branches
    parent_node.insert(child_index_rng[0], 'ARG')

    return str(tree)

302

303
304
305
306
307
def prepare_annotated_reviews(source_location=None, target_location=None):
    """Export the sentences of all annotated reviews as a flat XML file.

    Every sentence of every review marked ``annotated="true"`` becomes a
    ``<sentence>`` element holding its ``<text>``. Sentences that carry
    annotations also get an ``<aspectTerms>`` element with one ``<aspectTerm>``
    per annotation; its attributes are ``term`` (the annotated slice of the
    text), ``polarity``, ``from`` and ``to`` (the stored range, end exclusive).

    :param source_location: annotated reviews file to read; defaults to
        ``selected_reviews_location``.
    :param target_location: file to write; defaults to
        ``prepared_reviews_location``.
    """
    if source_location is None:
        source_location = selected_reviews_location
    if target_location is None:
        target_location = prepared_reviews_location

    root = parse(source_location).getroot()
    annotated = [review for review in root if review.attrib['annotated'] == 'true']

    prepared_root = Element('sentences')

    for review in annotated:
        for sentence in review.find('sentences'):
            text = sentence.find('text').text

            sentence_node = SubElement(prepared_root, 'sentence')
            text_node = SubElement(sentence_node, 'text')
            text_node.text = text

            # explicit checks: truth-testing an Element is deprecated and only
            # ever reflected whether it has children
            annotations_node = sentence.find('annotations')
            if annotations_node is None or len(annotations_node) == 0:
                continue

            aspect_terms_node = SubElement(sentence_node, 'aspectTerms')
            for annotation in annotations_node:
                start, end = annotation.find('range').text.split(',')
                aspect_term_node = SubElement(aspect_terms_node, 'aspectTerm')
                aspect_term_node.set('term', text[int(start):int(end)])
                aspect_term_node.set('polarity', annotation.find('sentiment').text)
                aspect_term_node.set('from', start)
                aspect_term_node.set('to', end)

    # the XML declaration names no encoding, so the file has to be UTF-8 for
    # it to be read back correctly
    xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent='   ')
    xmlstr = '\n'.join(line for line in xmlstr.splitlines() if line.strip())
    with open(target_location, 'w', encoding='utf-8') as f:
        f.write(xmlstr)
333

334

335
# Pipeline stages, in order; only the uncommented stage runs.
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews()