Commit 3f3df636 authored by Joel Oksanen
Browse files

Reverted back to old method of annotation. Annotated around 50 reviews with...

Reverted to the old method of annotation. Annotated around 50 reviews with 605 opinion targets. Implemented a function to write these into a standardised XML file format for SA.
parent 6c6eb5e2
<?xml version="1.0" ?>
<data>
<review>
<review_id>R1ZPQA9FFF258V</review_id>
<text>Inexpensive item that does what it says it does!</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R2CHMZYG2ANFNF</review_id>
<text>Great little siren, super loud.</text>
<annotations>
<annotation>
<range>2,2</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3IL3NIR66ZIT1</review_id>
<text>Excellent article, very good product, I use it on a nikon D7100 and works very well.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
<annotation>
<range>5,5</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RITG7J2UJ8E8K</review_id>
<text>perfect</text>
<annotations/>
</review>
<review>
<review_id>R3FBMYUZMRDS2B</review_id>
<text>all right, thx</text>
<annotations/>
</review>
<review>
<review_id>R69WZSBAKCVGT</review_id>
<text>Excellent</text>
<annotations/>
</review>
<review>
<review_id>R2PMD968AXPVQE</review_id>
<text>Works great and fits in a case great!</text>
<annotations>
<annotation>
<range>6,6</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RFAB680HTJC0D</review_id>
<text>Good value for money.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RJOGGH5QLKYM0</review_id>
<text>k</text>
<annotations/>
</review>
<review>
<review_id>R2LTDW0NT8RFDS</review_id>
<text>They work great.</text>
<annotations>
<annotation>
<range>0,0</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RKIGA3PG851ET</review_id>
<text>Works as advertised.</text>
<annotations/>
</review>
<review>
<review_id>R1JDZKOC29P29S</review_id>
<text>Really its very nice order</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3PRMPI6NM38PV</review_id>
<text>What can I say my wife is fighting breast cancer and I want to show my support</text>
</review>
<review>
<review_id>RHLQ7TLNSHAYW</review_id>
<text>Nice quality, good buy.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R2IXUWWKIIXIYM</review_id>
<text>Not as good as expected, must try to use it more frequently so to get a better opinion</text>
<annotations>
<annotation>
<range>10,10</range>
<argument>product</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RRB4HZAN2FO3A</review_id>
<text>Great value over the GoPro site.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3CRJRIDEOLX25</review_id>
<text>Nice camera</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R1P3X6PWMBUDD5</review_id>
<text>Best deal for the price.</text>
<annotations>
<annotation>
<range>4,4</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RSZIVXUFRCBUL</review_id>
<text>The size was too large and was not as listed.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RJH5M9S1BUSAQ</review_id>
<text>Always good quality</text>
<annotations>
<annotation>
<range>2,2</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R1BX8K01RZ8ZL5</review_id>
<text>Very easy to install, camera angle are wider then other brand, most importantly, supper easy to review or play back.</text>
<annotations>
<annotation>
<range>5,5</range>
<argument>product</argument>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>11,11</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R35L8YQLK9G6H5</review_id>
<text>Great value</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
</data>
\ No newline at end of file
<review_id>R32MKKAKXKKW0C</review_id>
<text>I bought the same Sling pack but smaller 7x, after buying a couple more lenses I quickly grew out of it. This 9x is large, but perfect for me. I still use the smaller one too. I will take the larger one on vacations.</text>
<sentences>
<sentence>
<tokens>I bought the same Sling pack but smaller 7x , after buying a couple more lenses I quickly grew out of it .</tokens>
<annotations>
<annotation>
<range>4,5</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>15,15</range>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>21,21</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>This 9x is large , but perfect for me .</tokens>
<annotations>
<annotation>
<range>0,1</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I still use the smaller one too .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>I will take the larger one on vacations .</tokens>
<annotations>
<annotation>
<range>3,5</range>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</sentence>
</sentences>
</review>
<review>
<review_id>R1S1HWFSIF618Z</review_id>
<text>Instructions were written in very disjointed English. Was expecting much more from this $100 product. Very disappointed!</text>
<sentences>
<sentence>
<tokens>Instructions were written in very disjointed English .</tokens>
<annotations>
<annotation>
<range>0,0</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Was expecting much more from this $ 100 product .</tokens>
<annotations>
<annotation>
<range>5,8</range>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</sentence>
<sentence>
<tokens>Very disappointed !</tokens>
<annotations/>
</sentence>
</sentences>
</review>
</data>
<data><review><review_id>R1ZPQA9FFF258V</review_id><text>Inexpensive item that does what it says it does!</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>R2CHMZYG2ANFNF</review_id><text>Great little siren, super loud.</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>R3IL3NIR66ZIT1</review_id><text>Excellent article, very good product, I use it on a nikon D7100 and works very well.</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>RITG7J2UJ8E8K</review_id><text>perfect</text><annotations /></review><review><review_id>R3FBMYUZMRDS2B</review_id><text>all right, thx</text><annotations /></review><review><review_id>R69WZSBAKCVGT</review_id><text>Excellent</text><annotations /></review><review><review_id>R2PMD968AXPVQE</review_id><text>Works great and fits in a case great!</text><annotations><annotation><argument>feature</argument><range>&lt;class 'range'&gt;</range><sentiment>neutral</sentiment></annotation></annotations></review><review><review_id>RFAB680HTJC0D</review_id><text>Good value for money.</text><annotations /></review></data>
\ No newline at end of file
This diff is collapsed.
......@@ -16,9 +16,10 @@ min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
annotated_reviews_location = 'annotated_camera_reviews2.xml'
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
......@@ -64,8 +65,6 @@ def is_opinion_target(tree):
for sub in tree.subtrees()))
def prepare_reviews():
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
# drop reviews with empty review body
......@@ -87,11 +86,11 @@ def prepare_reviews():
# pick first n reviews
reviews = reviews.head(n)
print('Obtained', len(reviews), 'reviews')
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
root = Element('reviews')
for index, review in reviews.iterrows():
for _, review in reviews.iterrows():
review_node = SubElement(root, 'review')
review_node.set('annotated', 'false')
id_node = SubElement(review_node, 'review_id')
......@@ -103,6 +102,8 @@ def prepare_reviews():
text = review['review_body']
text = text.replace('<br />', '\n')
text = re.sub('[.][.]+', '...', text)
text = text.replace('&#34;', '"')
text = re.sub('[&][#][0-9]+[;]', ' ', text)
text_node.text = text
sentences_node = SubElement(review_node, 'sentences')
......@@ -121,30 +122,7 @@ def prepare_reviews():
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves())
ranges = []
for subtree in parse_tree.subtrees():
if is_opinion_target(subtree):
start, end = get_leaf_indices(parse_tree, subtree)
ranges.append((start, end, str(subtree)))
ranges.sort(key=(lambda t: t[1] - t[0]), reverse=True)
ranges_to_delete = []
for range in ranges:
subranges = [other_range for other_range in ranges if other_range != range and range_contains(range, other_range)]
if range_cover(range, subranges):
ranges_to_delete.extend(subranges)
elif subranges:
ranges_to_delete.append(range)
unique_ranges = list(filter(lambda r: Tree.fromstring(r[2]).label() in nouns, set(ranges) - set(ranges_to_delete)))
unique_ranges.sort(key=(lambda t: t[0]))
phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
for range in unique_ranges:
phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
phrase_range_node.text = '{},{}'.format(range[0], range[1])
tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``','""')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
......@@ -152,91 +130,177 @@ def prepare_reviews():
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
print('Obtained and parsed', len(reviews), 'reviews')
def annotate_reviews():
row_character_count = 100
reviews = parse(selected_reviews_location) # pd.read_csv(selected_reviews_location, sep='\t', error_bad_lines=False)
reviews = parse(selected_reviews_location)
root = reviews.getroot()
# filter out reviews that have been annotated already
not_annotated = [review for review in root if review.attrib['annotated'] == 'false']
n_annotated = len(root) - len(not_annotated)
os.system('clear')
for review in not_annotated:
for sentence in review.find('sentences'):
tokens = sentence.find('tokenized_text').text.split(' ')
phrase_ranges = sentence.find('phrase_ranges')
non_ranges = []
for r in phrase_ranges:
print(bcolors.WARNING + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + '\'+\': positive sentiment'+ bcolors.ENDC)
print(bcolors.OKBLUE + '\'0\': neutral/no sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'-\': negative sentiment' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'n\': not an opinion target' + bcolors.ENDC)
print(bcolors.OKBLUE + '\'q\': quit' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.UNDERLINE + product_title + bcolors.ENDC)
print('')
start = int(r.text.split(',')[0])
end = int(r.text.split(',')[1])
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
if t in range(start, end+1):
token_text = bcolors.BOLD + token_text + bcolors.ENDC
text_row += token_text
if t + 1 < len(tokens) and len(text_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(text_row)
text_row = ''
print(text_row)
print('')
while True:
inp = input('Enter the sentiment expressed towards the highlighted argument: ')
if inp in ['+', '0', '-']:
r.set('sentiment_annotation', sentiment_mappings[inp])
os.system('clear')
break
if inp in ['n']:
non_ranges.append(r)
os.system('clear')
break
elif inp in ['q']:
os.system('clear')
break
if inp == 'q':
break
for non_range in non_ranges:
phrase_ranges.remove(non_range)
if inp == 'q':
os.system('clear')
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'continue: \'c\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
index_row = ''
text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
print('')
annotations = []
while True:
task = input(': ')
if len(task.split(' ')) == 2:
rng = None
sentiment = ''
fst = task.split(' ')[0]
if fst.isdigit():
rng = (int(fst), int(fst))
elif (',' in fst and len(fst.split(',')) == 2 and
fst.split(',')[0].isdigit() and fst.split(',')[1].isdigit()):
rng = (int(fst.split(',')[0]), int(fst.split(',')[1]))
snd = task.split(' ')[1]
if snd in ['+', '0', '-']:
sentiment = snd
if rng and sentiment:
annotations.append((rng, sentiment))
if task in ['c', 's', 'q']:
if task in ['c']:
# save annotations to tree
annotations_node = SubElement(sentence, 'annotations')
for annotation in annotations:
annotation_node = SubElement(annotations_node, 'annotation')
range_node = SubElement(annotation_node, 'range')
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[1]]
break
if task == 'q':
break
if task == 'q':
os.system('clear')
break
if inp == 'q':
# save tree to file
n_annotated += 1
review.set('annotated', 'true')
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
def longest_common_subsequence(x, y):
seq = []
for i in range(min(len(x), len(y))):
if x[i] != y[i]:
break
else:
n_annotated += 1
review.set('annotated', 'true')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
seq.append(x[i])
return tuple(seq)
def labelled_tree_str(tree_str, start, end):
    """Return the parse tree with the annotated span replaced by an ``ARG`` leaf.

    The tree is parsed from ``tree_str`` with ``Tree.fromstring``. The child
    branches of the lowest node that covers leaves ``start`` through ``end``
    are removed and a single ``'ARG'`` leaf is inserted in their place.

    Whole child branches are replaced, so a leaf outside ``[start, end]``
    that shares one of those branches is replaced as well.

    Args:
        tree_str: bracketed parse-tree string accepted by ``Tree.fromstring``.
        start: index of the first leaf of the span (as used by
            ``Tree.leaf_treeposition``).
        end: index of the last leaf of the span, inclusive.

    Returns:
        ``str(tree)`` of the modified tree.

    Raises:
        IndexError: if a leaf index is out of range, or if the span is
            empty at the common ancestor (e.g. ``start`` lies after ``end``).
    """
    tree = Tree.fromstring(tree_str)
    start_pos = tree.leaf_treeposition(start)
    end_pos = tree.leaf_treeposition(end)

    # find highest parent node common to start and end
    if start == end:
        parent_pos = start_pos[:-1]
    else:
        parent_pos = longest_common_subsequence(start_pos, end_pos)

    # Climb through unary chains so that ARG replaces the topmost node whose
    # only content is the span. Parents are looked up by tree position rather
    # than via .parent(), so this works whether or not the tree class tracks
    # parents, and it stops cleanly at the root instead of calling len(None).
    while parent_pos and len(tree[parent_pos[:-1]]) == 1:
        parent_pos = parent_pos[:-1]
    parent_node = tree[parent_pos]

    # indices (within parent_node) of the branches holding the span
    depth = len(parent_pos)
    first_child = start_pos[depth]
    last_child = end_pos[depth]
    if first_child > last_child:
        raise IndexError('empty annotation range: {},{}'.format(start, end))

    # Remove branches between start and end inclusive. Deleting by index
    # (not list.remove, which matches by equality) guarantees that an equal
    # sibling elsewhere under parent_node is never removed by mistake.
    del parent_node[first_child:last_child + 1]

    # insert ARG in place of removed branches
    parent_node.insert(first_child, 'ARG')
    return str(tree)
def prepare_annotated_reviews():
reviews = parse(selected_reviews_location)
root = reviews.getroot()
annotated = [review for review in root if review.attrib['annotated'] == 'true']
prepared_root = Element('data')
for review in annotated:
for sentence in review.find('sentences'):
text = sentence.find('text').text
tree_str = sentence.find('parse_tree').text
annotations = sentence.find('annotations') if sentence.find('annotations') else []
for annotation in annotations:
instance_node = SubElement(prepared_root, 'instance')
text_node = SubElement(instance_node, 'text')
text_node.text = text
op_node = SubElement(instance_node, 'opinion')
op_node.text = annotation.find('sentiment').text
tree_node = SubElement(instance_node, 'tree')
start, end = annotation.find('range').text.split(',')
tree_node.text = labelled_tree_str(tree_str, int(start), int(end))