Commit 6c6eb5e2 authored by Joel Oksanen
Browse files

Implemented simpler annotation method.

parent 3299e4af
......@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
import re
# Tab-separated review dump that prepare_reviews() loads with pandas.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
# XML file that prepare_reviews() writes and annotate_reviews() parses.
# NOTE(review): the name is assigned twice in a row, so only the second value
# takes effect -- confirm the first assignment is a leftover.
selected_reviews_location = 'reviews_to_be_annotated2.xml'
selected_reviews_location = 'reviews_to_be_annotated.xml'
# Review-body length bounds; in the visible code they are referenced only by a
# commented-out filter in prepare_reviews().
min_characters = 0
max_characters = 200
# NOTE(review): presumably the number of reviews to select; its use is not
# visible in this view -- confirm.
n = 500
# Maps the annotator's one-character input to the sentiment label stored in XML.
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
# XML file holding saved annotations.
annotated_reviews_location = 'annotated_camera_reviews2.xml'
# Parse-tree labels that is_opinion_target() accepts inside a candidate phrase.
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
# Labels used to keep only phrase ranges whose own tree label is in this list.
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
# Tokenizer applied to raw review text.
tokenizer = TweetTokenizer()
# NOTE(review): presumably used to split reviews into sentences; the call site
# is not visible in this view -- confirm.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# CoreNLP wrapper whose parse() output is read with Tree.fromstring().
# NOTE(review): machine-specific absolute path.
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
class bcolors:
HEADER = '\033[95m'
......@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
# true if r1 contains r2
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Each range is a ``(start, end, tree_str)`` tuple: inclusive leaf indices
    plus the bracketed string form of the phrase's parse tree.

    ``r1`` contains ``r2`` when the index span of ``r2`` lies inside that of
    ``r1`` and the tree of ``r2`` equals one of the subtrees of ``r1``'s tree.
    """
    # Cheap index comparison first: trees are only parsed when the spans nest.
    if not (r1[0] <= r2[0] and r1[1] >= r2[1]):
        return False
    # Previously a bare index-only return preceded this check and made it
    # unreachable; both conditions now live in a single code path.
    return Tree.fromstring(r2[2]) in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if ``n`` lies within the inclusive range ``r``.

    Only ``r[0]`` (start) and ``r[1]`` (end) are read, so ``r`` may carry
    extra trailing elements.
    """
    return r[0] <= n <= r[1]
......@@ -52,25 +53,24 @@ def in_range(r, n):
# true if rs cover r
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover every index of ``r``.

    ``r`` and each element of ``rs`` are sequences whose first two items are
    inclusive start and end indices; any further items are ignored. An empty
    ``r`` (start greater than end) is trivially covered.
    """
    # The earlier nested loop used ``continue`` on the inner loop, which never
    # skipped the ``return False`` that followed, so any non-empty ``r`` was
    # reported as uncovered. Each index now needs at least one covering range.
    return all(
        any(other[0] <= idx <= other[1] for other in rs)
        for idx in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as a candidate opinion target.

    The tree's own label must be in ``included_labels``, and every subtree
    yielded by ``tree.subtrees()`` must either carry an included label or be
    a ``PRP`` node whose first child lower-cases to ``'it'``.
    """
    def acceptable(node):
        # An included label passes outright; otherwise only the pronoun "it".
        if node.label() in included_labels:
            return True
        return node.label() == 'PRP' and node[0].lower() == 'it'

    if tree.label() not in included_labels:
        return False
    for node in tree.subtrees():
        if not acceptable(node):
            return False
    return True
def prepare_reviews():
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# # select reviews with specified review_body length
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# # filter out reviews with more than one sentence
# reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
......@@ -93,6 +93,7 @@ def prepare_reviews():
for index, review in reviews.iterrows():
review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id']
title_node = SubElement(review_node, 'product_title')
......@@ -100,6 +101,7 @@ def prepare_reviews():
text_node = SubElement(review_node, 'review_body')
# reformat text
text = review['review_body']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text)
text_node.text = text
......@@ -115,14 +117,17 @@ def prepare_reviews():
parse_tree_str = nlp.parse(sentence)
parse_tree = Tree.fromstring(parse_tree_str)
parse_tree_node = SubElement(sentence_node, 'parse_tree')
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves())
ranges = []
for subtree in parse_tree.subtrees():
if subtree.label() == 'NP':
if is_opinion_target(subtree):
start, end = get_leaf_indices(parse_tree, subtree)
ranges.append((start, end))
ranges.append((start, end, str(subtree)))
ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True)
ranges_to_delete = []
......@@ -133,135 +138,105 @@ def prepare_reviews():
elif subranges:
ranges_to_delete.append(range)
unique_ranges = list(set(ranges) - set(ranges_to_delete))
unique_ranges = list(filter(lambda r: Tree.fromstring(r[2]).label() in nouns, set(ranges) - set(ranges_to_delete)))
unique_ranges.sort(key=(lambda t: t[0]))
phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
for range in unique_ranges:
phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
phrase_range_node.text = '{},{}'.format(range[0], range[1])
# save selected reviews
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
# reviews.to_csv(selected_reviews_location, sep='\t', index=False)
def annotate_reviews():
row_character_count = 100
reviews = parse(selected_reviews_location) # pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
annotated_reviews = parse(annotated_reviews_location) if os.path.isfile(annotated_reviews_location) else None
root = annotated_reviews.getroot() if annotated_reviews else Element('data')
n_annotated = len(root)
root = reviews.getroot()
# filter out reviews that have been annotated already
annotated_review_ids = [id_node.text for id_node in root.iter('review_id')]
for review in reviews.findall('node2'):
if review.find('review_id').text in annotated_review_ids:
reviews.remove(review)
not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
n_annotated = len(root) - len(not_annotated)
os.system('clear')
for review in reviews:
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
product_title = review['product_title']
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
for review in not_annotated:
for sentence in review.find('sentences'):
tokens = sentence.find('tokenized_text').text.split(' ')
phrase_ranges = sentence.find('phrase_ranges')
non_ranges = []
for r in phrase_ranges:
print(bcolors.WARNING + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + '\'+\': positive sentiment'+ bcolors.ENDC)
print(bcolors.OKBLUE + '\'0\': neutral/no sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'-\': negative sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'n\': not an opinion target' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'q\': quit' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.UNDERLINE + product_title + bcolors.ENDC)
print('')
start = int(r.text.split(',')[0])
end = int(r.text.split(',')[1])
text = review['review_body']
tokens = tokenizer.tokenize(text)
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
index_row = ''
text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
print('')
annotations = []
while True:
task = input('Enter \'a\' to add an argument, \'c\' to save annotation and continue, \'s\' to skip, or \'q\' to quit: ')
if task == 'a':
rng = None
arg = ''
sentiment = ''
while not rng:
inp = input('Enter the index (range) of the argument (in the form x or x,y): ')
if inp.isdigit():
rng = (int(inp), int(inp))
elif (',' in inp and len(inp.split(',')) == 2 and
inp.split(',')[0].isdigit() and inp.split(',')[1].isdigit()):
rng = (int(inp.split(',')[0]), int(inp.split(',')[1]))
while not arg:
inp = input('Enter argument type (\'p\' for product, \'f\' for feature): ')
if inp in ['p', 'f']:
arg = inp
while not sentiment:
inp = input('Enter the sentiment (\'+\', \'0\', \'-\') expressed towards the argument: ')
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
if t in range(start, end+1):
token_text = bcolors.BOLD + token_text + bcolors.ENDC
text_row += token_text
if t + 1 < len(tokens) and len(text_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(text_row)
text_row = ''
print(text_row)
print('')
while True:
inp = input('Enter the sentiment expressed towards the highlighted argument: ')
if inp in ['+', '0', '-']:
sentiment = inp
annotations.append((rng, arg, sentiment))
if task in ['c', 's', 'q']:
if task in ['c', 's']:
n_annotated += 1
# save annotations to tree
review_node = SubElement(root, 'review')
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id']
text_node = SubElement(review_node, 'text')
text_node.text = text
if task == 'c':
annotations_node = SubElement(review_node, 'annotations')
for annotation in annotations:
annotation_node = SubElement(annotations_node, 'annotation')
range_node = SubElement(annotation_node, 'range')
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
arg_node = SubElement(annotation_node, 'argument')
arg_node.text = 'product' if annotation[1] == 'p' else 'feature'
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[2]]
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(annotated_reviews_location, 'w') as f:
f.write(xmlstr)
os.system('clear')
r.set('sentiment_annotation', sentiment_mappings[inp])
os.system('clear')
break
if inp in ['n']:
non_ranges.append(r)
os.system('clear')
break
elif inp in ['q']:
os.system('clear')
break
if inp == 'q':
break
for non_range in non_ranges:
phrase_ranges.remove(non_range)
if inp == 'q':
break
if task == 'q':
if inp == 'q':
break
else:
n_annotated += 1
review.set('annotated', 'true')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
# prepare_reviews()
# annotate_reviews()
# Script entry: build the review XML first, then start the interactive
# annotation session, which parses the file written by prepare_reviews().
prepare_reviews()
annotate_reviews()
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment