Commit 59076bb3 authored by Joel Oksanen
Browse files

Annotated some Amazon reviews

parent 16cb2dcc
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -18,7 +18,7 @@ min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_bgs = {'positive': bg.green, 'neutral': bg.li_black, 'negative': bg.red, 'conflict': bg.yellow}
ann_bgs = {'positive': bg.green, 'neutral': bg.blue, 'negative': bg.red, 'conflict': bg.yellow}
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
......@@ -81,7 +81,7 @@ def prepare_reviews():
reviews = reviews[~reviews['review_body'].isnull()]
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
filter_words = ['accessor', 'batter', 'charger', 'tripod', 'strap', 'case', 'bag', 'filter',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
for word in filter_words:
......@@ -170,6 +170,7 @@ def annotate_reviews():
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'undo: \'u\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
......@@ -180,11 +181,16 @@ def annotate_reviews():
text_row = ''
for t in range(len(text)):
char = text[t]
if t == cursor_pos:
char = bg.blue + char + bg.rs
if start != None and cursor_pos >= start and t in range(start, cursor_pos+1):
char = bg.li_black + char + bg.rs
elif t == cursor_pos:
char = bg.li_black + char + bg.rs
for ann in annotations:
if t in range(ann[0][0], ann[0][1]):
char = ann_bgs[ann[1]] + char + bg.rs
text_row += char
if (t + 1) % row_character_count == 0:
......@@ -219,8 +225,11 @@ def annotate_reviews():
cursor_pos = min(cursor_pos + 1, len(text) - 1)
break
if task == 'u' and annotations:
del annotations[-1]
if task in ['n', 's', 'q']:
if task in ['n']:
if task in ['n'] and annotations:
# save annotations to tree
annotations_node = SubElement(sentence, 'annotations')
for annotation in annotations:
......@@ -230,15 +239,20 @@ def annotate_reviews():
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = annotation[1]
break
if task == 'q':
if task == 'q' or task == 's':
break
if task == 'q':
os.system('clear')
break
elif task == 's':
root.remove(review)
elif task == 'n':
n_annotated += 1
review.set('annotated', 'true')
# save tree to file
n_annotated += 1
review.set('annotated', 'true')
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
......@@ -304,38 +318,17 @@ def prepare_annotated_reviews():
for annotation in sentence.find('annotations'):
start, end = annotation.find('range').text.split(',')
aspect_term_node = SubElement(aspect_terms_node, 'aspectTerm')
aspect_term_node.set('term', text[start:end])
aspect_term_node.set('term', text[int(start):int(end)])
aspect_term_node.set('polarity', annotation.find('sentiment').text)
aspect_term_node.set('from', start)
aspect_term_node.set('to', end)
train_count = 1000
train_root = Element('data')
test_root = Element('data')
counts = {'positive': 0, 'neutral': 0, 'negative': 0, 'conflict': 0}
for instance in prepared_root:
if counts[instance.find('opinion').text] < train_count:
train_root.append(instance)
else:
test_root.append(instance)
counts[instance.find('opinion').text] += 1
print(counts)
print(len(train_root))
print(len(test_root))
xmlstr = minidom.parseString(tostring(train_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_train.xml', 'w') as f:
f.write(xmlstr)
xmlstr = minidom.parseString(tostring(test_root)).toprettyxml(indent=' ')
xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_test.xml', 'w') as f:
f.write(xmlstr)
# prepare_reviews()
annotate_reviews()
# prepare_annotated_reviews()
# annotate_reviews()
prepare_annotated_reviews()
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment