Commit 6c6eb5e2 authored by Joel Oksanen
Browse files

Implemented simpler annotation method.

parent 3299e4af
......@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
import re
# Tab-separated review dump that prepare_reviews() loads.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
# XML file that prepare_reviews() writes and annotate_reviews() parses and
# saves back to. (A superseded assignment of this name was removed: it was
# overwritten immediately and never took effect.)
selected_reviews_location = 'reviews_to_be_annotated.xml'
# Review-body length bounds; only referenced by a commented-out filter in
# prepare_reviews().
min_characters = 0
max_characters = 200
# NOTE(review): presumably the number of reviews to select -- its use is not
# visible in this part of the file; confirm.
n = 500
# Maps the key typed by the annotator to the sentiment label that gets stored.
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
# Path of the standalone annotations XML file.
annotated_reviews_location = 'annotated_camera_reviews2.xml'
# Parse-tree labels permitted anywhere inside an opinion-target phrase
# (checked by is_opinion_target()).
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
# Subset of labels a phrase's root must carry to be kept as a phrase range.
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
class bcolors:
HEADER = '\033[95m'
......@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
# true if r1 contains r2
def range_contains(r1, r2):
    """Return whether phrase range ``r1`` contains phrase range ``r2``.

    Each range is indexed as ``(start, end, tree_str)``: inclusive leaf
    indices followed by the string form of the phrase's parse subtree.

    ``r1`` contains ``r2`` only when both hold:
      * the index span of ``r2`` lies inside the span of ``r1``, and
      * the tree parsed from ``r2[2]`` equals one of the subtrees of the
        tree parsed from ``r1[2]``.

    The index comparison runs first and short-circuits, so the trees are
    only parsed when the spans actually nest.
    """
    # A stale index-only ``return`` used to sit above this statement and made
    # the subtree check unreachable; only the complete check is kept.
    return (r1[0] <= r2[0] and r1[1] >= r2[1]
            and Tree.fromstring(r2[2]) in Tree.fromstring(r1[2]).subtrees())
def in_range(r, n):
    """Return whether index ``n`` falls inside the inclusive range ``r``.

    Only ``r[0]`` (start) and ``r[1]`` (end) are read, so tuples carrying
    extra trailing elements are accepted as well.
    """
    if r[0] > n:
        # n lies before the start of the range
        return False
    return n <= r[1]
......@@ -52,25 +53,24 @@ def in_range(r, n):
# true if rs cover r
def range_cover(r, rs):
    """Return whether the ranges in ``rs`` together cover every index of ``r``.

    ``r`` and each element of ``rs`` are indexed as ``(start, end, ...)`` with
    inclusive bounds; any further elements are ignored. An index counts as
    covered when at least one range in ``rs`` includes it (the same
    inclusive-bounds test that in_range() performs).

    An empty ``r`` (``r[0] > r[1]``) is trivially covered; a non-empty ``r``
    is never covered by an empty ``rs``.
    """
    # The former inner loop whose body was just ``continue`` had no effect on
    # the result and has been dropped; the any() check is the real test.
    return all(
        any(other_r[0] <= n <= other_r[1] for other_r in rs)
        for n in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return whether parse subtree ``tree`` qualifies as an opinion target.

    The tree qualifies when its own label is in ``included_labels`` and every
    subtree yielded by ``tree.subtrees()`` either carries a label from
    ``included_labels`` or is a ``PRP`` node whose first child, lower-cased,
    is ``'it'``.
    """
    if tree.label() not in included_labels:
        return False
    for sub in tree.subtrees():
        if sub.label() in included_labels:
            continue
        # the pronoun "it" is the one tolerated label outside the list
        if sub.label() == 'PRP' and sub[0].lower() == 'it':
            continue
        return False
    return True
def prepare_reviews():
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()]
# # select reviews with specified review_body length
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# # filter out reviews with more than one sentence
# reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
......@@ -93,6 +93,7 @@ def prepare_reviews():
for index, review in reviews.iterrows():
review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id']
title_node = SubElement(review_node, 'product_title')
......@@ -100,6 +101,7 @@ def prepare_reviews():
text_node = SubElement(review_node, 'review_body')
# reformat text
text = review['review_body']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text)
text_node.text = text
......@@ -115,14 +117,17 @@ def prepare_reviews():
parse_tree_str = nlp.parse(sentence)
parse_tree = Tree.fromstring(parse_tree_str)
parse_tree_node = SubElement(sentence_node, 'parse_tree')
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves())
ranges = []
for subtree in parse_tree.subtrees():
if subtree.label() == 'NP':
if is_opinion_target(subtree):
start, end = get_leaf_indices(parse_tree, subtree)
ranges.append((start, end))
ranges.append((start, end, str(subtree)))
ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True)
ranges_to_delete = []
......@@ -133,52 +138,52 @@ def prepare_reviews():
elif subranges:
ranges_to_delete.append(range)
unique_ranges = list(set(ranges) - set(ranges_to_delete))
unique_ranges = list(filter(lambda r: Tree.fromstring(r[2]).label() in nouns, set(ranges) - set(ranges_to_delete)))
unique_ranges.sort(key=(lambda t: t[0]))
phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
for range in unique_ranges:
phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
phrase_range_node.text = '{},{}'.format(range[0], range[1])
# save selected reviews
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
# reviews.to_csv(selected_reviews_location, sep='\t', index=False)
def annotate_reviews():
row_character_count = 100
reviews = parse(selected_reviews_location) # pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
annotated_reviews = parse(annotated_reviews_location) if os.path.isfile(annotated_reviews_location) else None
root = annotated_reviews.getroot() if annotated_reviews else Element('data')
n_annotated = len(root)
root = reviews.getroot()
# filter out reviews that have been annotated already
annotated_review_ids = [id_node.text for id_node in root.iter('review_id')]
for review in reviews.findall('node2'):
if review.find('review_id').text in annotated_review_ids:
reviews.remove(review)
not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
n_annotated = len(root) - len(not_annotated)
os.system('clear')
for review in reviews:
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
for review in not_annotated:
for sentence in review.find('sentences'):
tokens = sentence.find('tokenized_text').text.split(' ')
phrase_ranges = sentence.find('phrase_ranges')
non_ranges = []
for r in phrase_ranges:
print(bcolors.WARNING + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
product_title = review['product_title']
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print(bcolors.OKBLUE + '\'+\': positive sentiment'+ bcolors.ENDC)
print(bcolors.OKBLUE + '\'0\': neutral/no sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'-\': negative sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'n\': not an opinion target' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'q\': quit' + bcolors.ENDC)
print('')
text = review['review_body']
tokens = tokenizer.tokenize(text)
product_title = review.find('product_title').text
print(bcolors.UNDERLINE + product_title + bcolors.ENDC)
print('')
start = int(r.text.split(',')[0])
end = int(r.text.split(',')[1])
index_row = ''
text_row = ''
......@@ -186,82 +191,52 @@ def annotate_reviews():
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
if t in range(start, end+1):
token_text = bcolors.BOLD + token_text + bcolors.ENDC
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
if t + 1 < len(tokens) and len(text_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(text_row)
index_row = ''
text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
print('')
annotations = []
while True:
task = input('Enter \'a\' to add an argument, \'c\' to save annotation and continue, \'s\' to skip, or \'q\' to quit: ')
if task == 'a':
rng = None
arg = ''
sentiment = ''
while not rng:
inp = input('Enter the index (range) of the argument (in the form x or x,y): ')
if inp.isdigit():
rng = (int(inp), int(inp))
elif (',' in inp and len(inp.split(',')) == 2 and
inp.split(',')[0].isdigit() and inp.split(',')[1].isdigit()):
rng = (int(inp.split(',')[0]), int(inp.split(',')[1]))
while not arg:
inp = input('Enter argument type (\'p\' for product, \'f\' for feature): ')
if inp in ['p', 'f']:
arg = inp
while not sentiment:
inp = input('Enter the sentiment (\'+\', \'0\', \'-\') expressed towards the argument: ')
inp = input('Enter the sentiment expressed towards the highlighted argument: ')
if inp in ['+', '0', '-']:
sentiment = inp
annotations.append((rng, arg, sentiment))
r.set('sentiment_annotation', sentiment_mappings[inp])
os.system('clear')
break
if task in ['c', 's', 'q']:
if task in ['c', 's']:
n_annotated += 1
# save annotations to tree
review_node = SubElement(root, 'review')
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id']
text_node = SubElement(review_node, 'text')
text_node.text = text
if task == 'c':
annotations_node = SubElement(review_node, 'annotations')
for annotation in annotations:
annotation_node = SubElement(annotations_node, 'annotation')
range_node = SubElement(annotation_node, 'range')
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
arg_node = SubElement(annotation_node, 'argument')
arg_node.text = 'product' if annotation[1] == 'p' else 'feature'
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[2]]
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(annotated_reviews_location, 'w') as f:
f.write(xmlstr)
if inp in ['n']:
non_ranges.append(r)
os.system('clear')
break
elif inp in ['q']:
os.system('clear')
break
if task == 'q':
if inp == 'q':
break
# prepare_reviews()
# annotate_reviews()
for non_range in non_ranges:
phrase_ranges.remove(non_range)
if inp == 'q':
break
if inp == 'q':
break
else:
n_annotated += 1
review.set('annotated', 'true')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
# prepare_reviews() rewrites selected_reviews_location from scratch with every
# review marked annotated="false". annotate_reviews() saves its progress into
# that same file, so running the preparation step first would discard it.
# Uncomment only to regenerate the review set.
# prepare_reviews()
annotate_reviews()
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment