Skip to content
Snippets Groups Projects
Commit 6c6eb5e2 authored by Joel Oksanen
Browse files

Implemented simpler annotation method.

parent 3299e4af
No related branches found
No related tags found
No related merge requests found
...@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree ...@@ -11,16 +11,17 @@ from nltk.tree import ParentedTree as Tree
import re import re
data_location = 'amazon_reviews_us_Camera_v1_00.tsv' data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location = 'reviews_to_be_annotated2.xml' selected_reviews_location = 'reviews_to_be_annotated.xml'
min_characters = 0 min_characters = 0
max_characters = 200 max_characters = 200
n = 500 n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'} sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
annotated_reviews_location = 'annotated_camera_reviews2.xml' annotated_reviews_location = 'annotated_camera_reviews2.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
tokenizer = TweetTokenizer() tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
class bcolors: class bcolors:
HEADER = '\033[95m' HEADER = '\033[95m'
...@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree): ...@@ -44,7 +45,7 @@ def get_leaf_indices(tree, phrase_tree):
# true if r1 contains r2
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Each range is a ``(start, end, tree_str)`` tuple, where ``tree_str`` is
    the bracketed string form of the phrase subtree the range was built from.
    ``r1`` contains ``r2`` when its index span encloses ``r2``'s span and
    ``r2``'s tree is equal to one of the subtrees of ``r1``'s tree.

    NOTE(review): the indices are presumably inclusive leaf positions, as
    ``range_cover`` treats them -- confirm against ``get_leaf_indices``.
    """
    # Cheap index test first, so the trees are parsed only when the spans nest.
    if not (r1[0] <= r2[0] and r1[1] >= r2[1]):
        return False
    inner = Tree.fromstring(r2[2])
    return inner in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if index ``n`` lies within range ``r``, bounds inclusive.

    Only ``r[0]`` (start) and ``r[1]`` (end) are read, so ``r`` may carry
    extra trailing fields.
    """
    return r[0] <= n <= r[1]
...@@ -52,25 +53,24 @@ def in_range(r, n): ...@@ -52,25 +53,24 @@ def in_range(r, n):
# true if rs cover r
def range_cover(r, rs):
    """Return True if every index of range ``r`` falls inside some range in ``rs``.

    ``r`` and the members of ``rs`` are indexable with a start at ``[0]`` and
    an inclusive end at ``[1]``; any further fields are ignored. ``rs`` is
    scanned once per index of ``r``, so it must be re-iterable (a list or
    tuple, not a one-shot generator).

    An empty ``rs`` covers nothing, and an empty ``r`` (end < start) is
    trivially covered.
    """
    # The earlier loop used `continue` on a hit, which only advanced the inner
    # loop and then fell through to `return False` for every index. The
    # any/all form states the intended "each index is hit at least once".
    return all(
        any(other[0] <= n <= other[1] for other in rs)
        for n in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as a candidate opinion target.

    The root label must be in ``included_labels``. Every subtree yielded by
    ``tree.subtrees()`` must either have a label in ``included_labels`` or be
    a ``PRP`` node whose first child is the word "it" (case-insensitive).
    """
    # The root is tested separately: a bare PRP "it" node would pass the
    # per-subtree test below but is not an accepted root label.
    if tree.label() not in included_labels:
        return False
    return all(
        sub.label() in included_labels
        or (sub.label() == 'PRP' and sub[0].lower() == 'it')
        for sub in tree.subtrees()
    )
def prepare_reviews(): def prepare_reviews():
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body # drop reviews with empty review body
reviews = reviews[~reviews['review_body'].isnull()] reviews = reviews[~reviews['review_body'].isnull()]
# # select reviews with specified review_body length
# reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# # filter out reviews with more than one sentence
# reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
# try to filter out reviews for camera accessories # try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security'] 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
...@@ -93,6 +93,7 @@ def prepare_reviews(): ...@@ -93,6 +93,7 @@ def prepare_reviews():
for index, review in reviews.iterrows(): for index, review in reviews.iterrows():
review_node = SubElement(root, 'review') review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id') id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id'] id_node.text = review['review_id']
title_node = SubElement(review_node, 'product_title') title_node = SubElement(review_node, 'product_title')
...@@ -100,6 +101,7 @@ def prepare_reviews(): ...@@ -100,6 +101,7 @@ def prepare_reviews():
text_node = SubElement(review_node, 'review_body') text_node = SubElement(review_node, 'review_body')
# reformat text # reformat text
text = review['review_body'] text = review['review_body']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text) text = re.sub('[.][.]+', '...', text)
text_node.text = text text_node.text = text
...@@ -115,14 +117,17 @@ def prepare_reviews(): ...@@ -115,14 +117,17 @@ def prepare_reviews():
parse_tree_str = nlp.parse(sentence) parse_tree_str = nlp.parse(sentence)
parse_tree = Tree.fromstring(parse_tree_str) parse_tree = Tree.fromstring(parse_tree_str)
parse_tree_node = SubElement(sentence_node, 'parse_tree')
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text') tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves()) tokenized_text_node.text = ' '.join(parse_tree.leaves())
ranges = [] ranges = []
for subtree in parse_tree.subtrees(): for subtree in parse_tree.subtrees():
if subtree.label() == 'NP': if is_opinion_target(subtree):
start, end = get_leaf_indices(parse_tree, subtree) start, end = get_leaf_indices(parse_tree, subtree)
ranges.append((start, end)) ranges.append((start, end, str(subtree)))
ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True) ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True)
ranges_to_delete = [] ranges_to_delete = []
...@@ -133,135 +138,105 @@ def prepare_reviews(): ...@@ -133,135 +138,105 @@ def prepare_reviews():
elif subranges: elif subranges:
ranges_to_delete.append(range) ranges_to_delete.append(range)
unique_ranges = list(set(ranges) - set(ranges_to_delete)) unique_ranges = list(filter(lambda r: Tree.fromstring(r[2]).label() in nouns, set(ranges) - set(ranges_to_delete)))
unique_ranges.sort(key=(lambda t: t[0])) unique_ranges.sort(key=(lambda t: t[0]))
phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges') phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
for range in unique_ranges: for range in unique_ranges:
phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range') phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
phrase_range_node.text = '{},{}'.format(range[0], range[1]) phrase_range_node.text = '{},{}'.format(range[0], range[1])
# save selected reviews
# save tree to file # save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ') xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()]) xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f: with open(selected_reviews_location, 'w') as f:
f.write(xmlstr) f.write(xmlstr)
# reviews.to_csv(selected_reviews_location, sep='\t', index=False)
def annotate_reviews(): def annotate_reviews():
row_character_count = 100 row_character_count = 100
reviews = parse(selected_reviews_location) # pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False) reviews = parse(selected_reviews_location) # pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
root = reviews.getroot()
annotated_reviews = parse(annotated_reviews_location) if os.path.isfile(annotated_reviews_location) else None
root = annotated_reviews.getroot() if annotated_reviews else Element('data')
n_annotated = len(root)
# filter out reviews that have been annotated already # filter out reviews that have been annotated already
annotated_review_ids = [id_node.text for id_node in root.iter('review_id')] not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
n_annotated = len(root) - len(not_annotated)
for review in reviews.findall('node2'):
if review.find('review_id').text in annotated_review_ids:
reviews.remove(review)
os.system('clear') os.system('clear')
for review in reviews: for review in not_annotated:
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC) for sentence in review.find('sentences'):
print('') tokens = sentence.find('tokenized_text').text.split(' ')
phrase_ranges = sentence.find('phrase_ranges')
product_title = review['product_title'] non_ranges = []
print(bcolors.OKGREEN + product_title + bcolors.ENDC) for r in phrase_ranges:
print('') print(bcolors.WARNING + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + '\'+\': positive sentiment'+ bcolors.ENDC)
print(bcolors.OKBLUE + '\'0\': neutral/no sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'-\': negative sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'n\': not an opinion target' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'q\': quit' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.UNDERLINE + product_title + bcolors.ENDC)
print('')
start = int(r.text.split(',')[0])
end = int(r.text.split(',')[1])
text = review['review_body']
tokens = tokenizer.tokenize(text)
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
index_row = '' index_row = ''
text_row = '' text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC) for t in range(len(tokens)):
print(text_row) space = len(tokens[t]) - len(str(t))
print('')
token_text = tokens[t] + ' '
annotations = []
if t in range(start, end+1):
while True: token_text = bcolors.BOLD + token_text + bcolors.ENDC
task = input('Enter \'a\' to add an argument, \'c\' to save annotation and continue, \'s\' to skip, or \'q\' to quit: ')
text_row += token_text
if task == 'a': if t + 1 < len(tokens) and len(text_row) + len(tokens[t + 1]) + 1 > row_character_count:
rng = None print(text_row)
arg = '' text_row = ''
sentiment = '' print(text_row)
while not rng: print('')
inp = input('Enter the index (range) of the argument (in the form x or x,y): ')
if inp.isdigit(): while True:
rng = (int(inp), int(inp)) inp = input('Enter the sentiment expressed towards the highlighted argument: ')
elif (',' in inp and len(inp.split(',')) == 2 and
inp.split(',')[0].isdigit() and inp.split(',')[1].isdigit()):
rng = (int(inp.split(',')[0]), int(inp.split(',')[1]))
while not arg:
inp = input('Enter argument type (\'p\' for product, \'f\' for feature): ')
if inp in ['p', 'f']:
arg = inp
while not sentiment:
inp = input('Enter the sentiment (\'+\', \'0\', \'-\') expressed towards the argument: ')
if inp in ['+', '0', '-']: if inp in ['+', '0', '-']:
sentiment = inp r.set('sentiment_annotation', sentiment_mappings[inp])
annotations.append((rng, arg, sentiment)) os.system('clear')
break
if task in ['c', 's', 'q']:
if task in ['c', 's']: if inp in ['n']:
n_annotated += 1 non_ranges.append(r)
# save annotations to tree os.system('clear')
review_node = SubElement(root, 'review') break
id_node = SubElement(review_node, 'review_id')
id_node.text = review['review_id'] elif inp in ['q']:
text_node = SubElement(review_node, 'text') os.system('clear')
text_node.text = text break
if task == 'c':
annotations_node = SubElement(review_node, 'annotations') if inp == 'q':
for annotation in annotations: break
annotation_node = SubElement(annotations_node, 'annotation')
range_node = SubElement(annotation_node, 'range') for non_range in non_ranges:
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1]) phrase_ranges.remove(non_range)
arg_node = SubElement(annotation_node, 'argument') if inp == 'q':
arg_node.text = 'product' if annotation[1] == 'p' else 'feature'
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[2]]
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(annotated_reviews_location, 'w') as f:
f.write(xmlstr)
os.system('clear')
break break
if task == 'q': if inp == 'q':
break break
else:
n_annotated += 1
review.set('annotated', 'true')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
# prepare_reviews() # prepare_reviews()
# annotate_reviews() annotate_reviews()
prepare_reviews()
This diff is collapsed.
This diff is collapsed.
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment