Commit 6c6eb5e2 authored by Joel Oksanen
Browse files

Implemented simpler annotation method.

parent 3299e4af
...@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree ...@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
import re import re
# Tab-separated Amazon review dump read by prepare_reviews().
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
# XML file written by prepare_reviews() and read/updated in place by
# annotate_reviews().
selected_reviews_location = 'reviews_to_be_annotated.xml'
# NOTE(review): min_characters, max_characters and n are not referenced in the
# visible code; presumably review-selection parameters -- confirm.
min_characters = 0
max_characters = 200
n = 500
# Maps the annotator's key press to the sentiment label stored in the XML.
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
# NOTE(review): appears unused by the current annotate_reviews() -- confirm
# before removing.
annotated_reviews_location = 'annotated_camera_reviews2.xml'
# Parse-tree labels allowed anywhere inside a candidate opinion target
# (see is_opinion_target).
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
# Labels the root of a candidate must carry to be kept as a phrase range.
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']

tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# The StanfordCoreNLP client is intentionally not created here: it is
# instantiated inside prepare_reviews(), the only function that parses text,
# so annotate_reviews() can run without a CoreNLP installation.
class bcolors: class bcolors:
HEADER = '\033[95m' HEADER = '\033[95m'
...@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree): ...@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
def range_contains(r1, r2):
    """Return True if range r1 contains range r2.

    Both arguments are ``(start, end, tree_string)`` tuples, where ``start``
    and ``end`` are inclusive leaf indices and ``tree_string`` is the
    bracketed string form of the phrase's parse subtree.

    r1 contains r2 when r2's indices lie within r1's indices and r2's
    subtree occurs among the subtrees of r1's tree.
    """
    # Cheap index comparison first; only parse the trees when it passes.
    if not (r1[0] <= r2[0] and r1[1] >= r2[1]):
        return False
    inner = Tree.fromstring(r2[2])
    return inner in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if index ``n`` lies within the inclusive range ``r``.

    Only ``r[0]`` (start) and ``r[1]`` (end) are read, so ``r`` may carry
    extra trailing elements.
    """
    return r[0] <= n <= r[1]
...@@ -52,25 +53,24 @@ def in_range(r, n): ...@@ -52,25 +53,24 @@ def in_range(r, n):
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover range ``r``.

    Every index from ``r[0]`` to ``r[1]`` (inclusive) must fall inside at
    least one range of ``rs``. An empty ``r`` (start > end) is trivially
    covered; a non-empty ``r`` with empty ``rs`` is not.
    """
    # A plain `continue` inside an inner loop over rs would only skip to the
    # next candidate range, not to the next index, so the per-index test is
    # expressed with any() instead.
    for n in range(r[0], r[1] + 1):
        if not any(in_range(other_r, n) for other_r in rs):
            return False
    return True
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as a candidate opinion target.

    The root label must be in ``included_labels``, and every subtree yielded
    by ``tree.subtrees()`` must either carry a label from ``included_labels``
    or be a ``PRP`` node whose first child is the word "it" (any casing).
    """
    return (tree.label() in included_labels and
            all(sub.label() in included_labels or
                (sub.label() == 'PRP' and sub[0].lower() == 'it')
                for sub in tree.subtrees()))
def prepare_reviews(): def prepare_reviews():
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body # drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()] reviews = reviews[~reviews['review_body'].isnull()]
# # select reviews with specified review_body length
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# # filter out reviews with more than one sentence
# reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
# try to filter out reviews for camera accessories # try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security'] 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
...@@ -93,6 +93,7 @@ def prepare_reviews(): ...@@ -93,6 +93,7 @@ def prepare_reviews():
for index, review in reviews.iterrows(): for index, review in reviews.iterrows():
review_node = SubElement(root, 'review') review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id') id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id'] id_node.text = review['review_id']
title_node = SubElement(review_node, 'product_title') title_node = SubElement(review_node, 'product_title')
...@@ -100,6 +101,7 @@ def prepare_reviews(): ...@@ -100,6 +101,7 @@ def prepare_reviews():
text_node = SubElement(review_node, 'review_body') text_node = SubElement(review_node, 'review_body')
# reformat text # reformat text
text = review['review_body'] text = review['review_body']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text) text = re.sub('[.][.]+', '...', text)
text_node.text = text text_node.text = text
...@@ -115,14 +117,17 @@ def prepare_reviews(): ...@@ -115,14 +117,17 @@ def prepare_reviews():
parse_tree_str = nlp.parse(sentence) parse_tree_str = nlp.parse(sentence)
parse_tree = Tree.fromstring(parse_tree_str) parse_tree = Tree.fromstring(parse_tree_str)
parse_tree_node = SubElement(sentence_node, 'parse_tree')
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text') tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves()) tokenized_text_node.text = ' '.join(parse_tree.leaves())
ranges = [] ranges = []
for subtree in parse_tree.subtrees(): for subtree in parse_tree.subtrees():
if subtree.label() == 'NP': if is_opinion_target(subtree):
start, end = get_leaf_indices(parse_tree, subtree) start, end = get_leaf_indices(parse_tree, subtree)
ranges.append((start, end)) ranges.append((start, end, str(subtree)))
ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True) ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True)
ranges_to_delete = [] ranges_to_delete = []
...@@ -133,135 +138,105 @@ def prepare_reviews(): ...@@ -133,135 +138,105 @@ def prepare_reviews():
elif subranges: elif subranges:
ranges_to_delete.append(range) ranges_to_delete.append(range)
unique_ranges = list(set(ranges) - set(ranges_to_delete)) unique_ranges = list(filter(lambda r: Tree.fromstring(r[2]).label() in nouns, set(ranges) - set(ranges_to_delete)))
unique_ranges.sort(key=(lambda t: t[0])) unique_ranges.sort(key=(lambda t: t[0]))
phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges') phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
for range in unique_ranges: for range in unique_ranges:
phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range') phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
phrase_range_node.text = '{},{}'.format(range[0], range[1]) phrase_range_node.text = '{},{}'.format(range[0], range[1])
# save selected reviews
# save tree to file # save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ') xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()]) xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f: with open(selected_reviews_location, 'w') as f:
f.write(xmlstr) f.write(xmlstr)
# reviews.to_csv(selected_reviews_location, sep='\t', index=False)
def _print_annotation_header(n_annotated, product_title):
    """Print the progress counter, the key legend and the product title."""
    print(bcolors.WARNING + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
    print('')
    print(bcolors.OKBLUE + '\'+\': positive sentiment'+ bcolors.ENDC)
    print(bcolors.OKBLUE + '\'0\': neutral/no sentiment' + bcolors.ENDC)
    print(bcolors.OKBLUE + '\'-\': negative sentiment' + bcolors.ENDC)
    print(bcolors.OKBLUE + '\'n\': not an opinion target' + bcolors.ENDC)
    print(bcolors.OKBLUE + '\'q\': quit' + bcolors.ENDC)
    print('')
    print(bcolors.UNDERLINE + product_title + bcolors.ENDC)
    print('')


def _print_highlighted_tokens(tokens, start, end, row_character_count):
    """Print the sentence with tokens start..end (inclusive) in bold.

    A row is flushed once appending the next token would push it past
    ``row_character_count``; the measured length includes the escape
    sequences added for highlighting.
    """
    text_row = ''
    for t, token in enumerate(tokens):
        token_text = token + ' '
        if start <= t <= end:
            token_text = bcolors.BOLD + token_text + bcolors.ENDC
        text_row += token_text
        if t + 1 < len(tokens) and len(text_row) + len(tokens[t + 1]) + 1 > row_character_count:
            print(text_row)
            text_row = ''
    print(text_row)
    print('')


def _save_reviews(root):
    """Pretty-print the XML tree to selected_reviews_location, dropping blank lines."""
    xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
    xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
    with open(selected_reviews_location, 'w') as f:
        f.write(xmlstr)


def annotate_reviews():
    """Interactively annotate the sentiment towards each candidate phrase.

    Loads the XML at ``selected_reviews_location`` and walks every review
    whose ``annotated`` attribute is ``'false'``. For each ``phrase_range``
    of each sentence the phrase is shown in bold and the user enters:

    * ``+`` / ``0`` / ``-``: stored on the range as ``sentiment_annotation``
    * ``n``: the range is not an opinion target and is removed
    * ``q``: quit

    Any other input repeats the prompt. Once all ranges of a review are
    handled the review is marked ``annotated='true'`` and the whole tree is
    written back to ``selected_reviews_location``. Quitting part-way through
    a review does not save that review's partial annotations.
    """
    row_character_count = 100

    reviews = parse(selected_reviews_location)
    root = reviews.getroot()

    # filter out reviews that have been annotated already
    not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
    n_annotated = len(root) - len(not_annotated)

    os.system('clear')

    for review in not_annotated:
        # Explicit flag instead of re-testing the last input string: a review
        # without any phrase ranges never prompts, so the input variable
        # could otherwise be unbound (or stale from a previous review).
        quit_requested = False

        for sentence in review.find('sentences'):
            tokens = sentence.find('tokenized_text').text.split(' ')
            phrase_ranges = sentence.find('phrase_ranges')
            non_ranges = []

            for r in phrase_ranges:
                _print_annotation_header(n_annotated, review.find('product_title').text)

                bounds = r.text.split(',')
                start, end = int(bounds[0]), int(bounds[1])
                _print_highlighted_tokens(tokens, start, end, row_character_count)

                while True:
                    inp = input('Enter the sentiment expressed towards the highlighted argument: ')
                    if inp in ('+', '0', '-'):
                        r.set('sentiment_annotation', sentiment_mappings[inp])
                    elif inp == 'n':
                        non_ranges.append(r)
                    elif inp == 'q':
                        quit_requested = True
                    else:
                        # unrecognised input: ask again
                        continue
                    os.system('clear')
                    break

                if quit_requested:
                    break

            # Removal is deferred so phrase_ranges is not mutated while it is
            # being iterated.
            for non_range in non_ranges:
                phrase_ranges.remove(non_range)

            if quit_requested:
                break

        if quit_requested:
            break

        n_annotated += 1
        review.set('annotated', 'true')
        # save tree to file
        _save_reviews(root)
# Run once to (re)build the file of reviews to annotate; requires the
# Stanford CoreNLP installation referenced in prepare_reviews().
# prepare_reviews()
annotate_reviews()
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment