Commit 3299e4af authored by  Joel  Oksanen's avatar Joel Oksanen
Browse files

First review annotation implementation

parent 823ae4da
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, Element, SubElement
from nltk.tree import ParentedTree as Tree
from stanfordcorenlp import StanfordCoreNLP
import re
filepath = 'train.raw'
......
......@@ -16,7 +16,7 @@ class SentimentAnalyzer:
def train_expr_clf(self, instances):
fvs = [instance.vector for instance in instances]
targets = [instance.opinion for instance in instances]
targets = [instance.opinion != 'neutral' for instance in instances]
self.expr_clf.fit(fvs, targets)
def get_feature_vector(self, instance):
......@@ -31,8 +31,8 @@ semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelle
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' #
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
tweet_test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml'
train_path = semeval_train_path
test_path = semeval_test_path
train_path = tweet_train_path
test_path = tweet_test_path
sa = SentimentAnalyzer()
......@@ -54,10 +54,10 @@ vec.vectorize(test_instances)
# predict test set values
pred = sa.expresses_sentiment(test_instances)
targets = [instance.opinion for instance in test_instances]
targets = [instance.opinion != 'neutral' for instance in test_instances]
# evaluate results
cm = confusion_matrix(targets, pred, labels=['positive', 'neutral', 'negative'])
cm = confusion_matrix(targets, pred, labels=[True, False])
acc = len([i for i in range(len(targets)) if targets[i] == pred[i]]) / len(targets)
print(cm)
print('accuracy:', acc)
......@@ -26,7 +26,7 @@ class Vectorizer:
train_dep_vectors = self.get_dep_vectors(train_instances, learning=True)
# store vectors for training set:
train_vectors = np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = train_indep_vectors # np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = self.transformer.fit_transform(train_vectors).toarray()
for i in range(len(train_instances)):
......@@ -45,7 +45,7 @@ class Vectorizer:
# dep features:
dep_vectors = self.get_dep_vectors(instances, learning=False)
# store vectors:
vectors = np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = indep_vectors # np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = self.transformer.fit_transform(vectors).toarray()
for i in range(len(instances)):
instances[i].vector = vectors[i]
......
<?xml version="1.0" ?>
<data>
<review>
<review_id>R1ZPQA9FFF258V</review_id>
<text>Inexpensive item that does what it says it does!</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R2CHMZYG2ANFNF</review_id>
<text>Great little siren, super loud.</text>
<annotations>
<annotation>
<range>2,2</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3IL3NIR66ZIT1</review_id>
<text>Excellent article, very good product, I use it on a nikon D7100 and works very well.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
<annotation>
<range>5,5</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RITG7J2UJ8E8K</review_id>
<text>perfect</text>
<annotations/>
</review>
<review>
<review_id>R3FBMYUZMRDS2B</review_id>
<text>all right, thx</text>
<annotations/>
</review>
<review>
<review_id>R69WZSBAKCVGT</review_id>
<text>Excellent</text>
<annotations/>
</review>
<review>
<review_id>R2PMD968AXPVQE</review_id>
<text>Works great and fits in a case great!</text>
<annotations>
<annotation>
<range>6,6</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RFAB680HTJC0D</review_id>
<text>Good value for money.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RJOGGH5QLKYM0</review_id>
<text>k</text>
<annotations/>
</review>
<review>
<review_id>R2LTDW0NT8RFDS</review_id>
<text>They work great.</text>
<annotations>
<annotation>
<range>0,0</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RKIGA3PG851ET</review_id>
<text>Works as advertised.</text>
<annotations/>
</review>
<review>
<review_id>R1JDZKOC29P29S</review_id>
<text>Really its very nice order</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3PRMPI6NM38PV</review_id>
<text>What can I say my wife is fighting breast cancer and I want to show my support</text>
</review>
<review>
<review_id>RHLQ7TLNSHAYW</review_id>
<text>Nice quality, good buy.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R2IXUWWKIIXIYM</review_id>
<text>Not as good as expected, must try to use it more frequently so to get a better opinion</text>
<annotations>
<annotation>
<range>10,10</range>
<argument>product</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RRB4HZAN2FO3A</review_id>
<text>Great value over the GoPro site.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R3CRJRIDEOLX25</review_id>
<text>Nice camera</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>product</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R1P3X6PWMBUDD5</review_id>
<text>Best deal for the price.</text>
<annotations>
<annotation>
<range>4,4</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RSZIVXUFRCBUL</review_id>
<text>The size was too large and was not as listed.</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>negative</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>RJH5M9S1BUSAQ</review_id>
<text>Always good quality</text>
<annotations>
<annotation>
<range>2,2</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R1BX8K01RZ8ZL5</review_id>
<text>Very easy to install, camera angle are wider then other brand, most importantly, supper easy to review or play back.</text>
<annotations>
<annotation>
<range>5,5</range>
<argument>product</argument>
<sentiment>neutral</sentiment>
</annotation>
<annotation>
<range>11,11</range>
<argument>feature</argument>
<sentiment>neutral</sentiment>
</annotation>
</annotations>
</review>
<review>
<review_id>R35L8YQLK9G6H5</review_id>
<text>Great value</text>
<annotations>
<annotation>
<range>1,1</range>
<argument>feature</argument>
<sentiment>positive</sentiment>
</annotation>
</annotations>
</review>
</data>
\ No newline at end of file
<data><review><review_id>R1ZPQA9FFF258V</review_id><text>Inexpensive item that does what it says it does!</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>R2CHMZYG2ANFNF</review_id><text>Great little siren, super loud.</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>R3IL3NIR66ZIT1</review_id><text>Excellent article, very good product, I use it on a nikon D7100 and works very well.</text><annotations><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation><annotation><argument>product</argument><range>&lt;class 'range'&gt;</range><sentiment>positive</sentiment></annotation></annotations></review><review><review_id>RITG7J2UJ8E8K</review_id><text>perfect</text><annotations /></review><review><review_id>R3FBMYUZMRDS2B</review_id><text>all right, thx</text><annotations /></review><review><review_id>R69WZSBAKCVGT</review_id><text>Excellent</text><annotations /></review><review><review_id>R2PMD968AXPVQE</review_id><text>Works great and fits in a case great!</text><annotations><annotation><argument>feature</argument><range>&lt;class 'range'&gt;</range><sentiment>neutral</sentiment></annotation></annotations></review><review><review_id>RFAB680HTJC0D</review_id><text>Good value for money.</text><annotations /></review></data>
\ No newline at end of file
import pandas as pd
import math
from nltk.tokenize import TweetTokenizer
import os
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from datetime import datetime
from xml.dom import minidom
import nltk.data
from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import ParentedTree as Tree
import re
# Amazon camera-review dump (tab-separated) that reviews are sampled from.
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
# Output of prepare_reviews(): sampled reviews plus candidate NP phrase ranges.
selected_reviews_location = 'reviews_to_be_annotated2.xml'
# Review-body length bounds (referenced only by the commented-out length filter below).
min_characters = 0
max_characters = 200
# Number of reviews to sample for annotation.
n = 500
# Maps the one-character sentiment keys typed by the annotator to XML labels.
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
# Output of annotate_reviews(): reviews with completed annotations.
annotated_reviews_location = 'annotated_camera_reviews2.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Local Stanford CoreNLP install used for constituency parsing.
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
class bcolors:
    """ANSI escape sequences for colouring the terminal annotation prompts."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # resets all formatting
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
def get_leaf_indices(tree, phrase_tree):
    """Return the inclusive (start, end) leaf-index span that phrase_tree
    occupies within tree.

    A leaf belongs to the phrase exactly when its tree position begins with
    the phrase subtree's own position.
    """
    prefix = phrase_tree.treeposition()
    depth = len(prefix)
    total = len(tree.leaves())
    # First leaf whose position lies under the phrase subtree.
    start = next(i for i in range(total)
                 if tree.leaf_treeposition(i)[:depth] == prefix)
    # Extend the span while consecutive leaves stay under the same subtree.
    end = start
    while end + 1 < total and tree.leaf_treeposition(end + 1)[:depth] == prefix:
        end += 1
    return (start, end)
def range_contains(r1, r2):
    """Return True when inclusive range r1 fully encloses inclusive range r2."""
    outer_lo, outer_hi = r1
    inner_lo, inner_hi = r2
    return not (inner_lo < outer_lo or inner_hi > outer_hi)
def in_range(r, n):
    """Return True when index n lies within the inclusive range r."""
    lo, hi = r
    return lo <= n <= hi
def range_cover(r, rs):
    """Return True when the ranges in rs jointly cover every index of the
    inclusive range r.

    Bug fix: the original placed `return False` inside the inner loop after a
    `continue`, so it returned False as soon as ANY single candidate range
    failed to cover an index — even if another candidate did cover it. The
    correct test is that NO candidate covers the index. (The original also
    returned True for empty rs; an empty set covers nothing, so that now
    returns False — the caller's outcome is unchanged since it only extends a
    list with the empty subrange set.)
    """
    for idx in range(r[0], r[1] + 1):
        # Inlined in_range check: idx must fall inside at least one candidate.
        if not any(lo <= idx <= hi for lo, hi in rs):
            return False
    return True
def prepare_reviews():
    """Sample n camera reviews from the Amazon TSV, parse each sentence with
    CoreNLP, extract candidate noun-phrase (NP) leaf-index ranges, and write
    everything to selected_reviews_location as XML for later annotation.

    Fixes/cleanup: the builtin `range` was shadowed by two loop variables
    (renamed to `span`); the unused local `phrase_indices` is removed; the
    capitalisation pattern is built with str.join instead of trailing-`|`
    concatenation.
    """
    reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
    # drop reviews with empty review body
    reviews = reviews[~reviews['review_body'].isnull()]
    # # select reviews with specified review_body length
    # reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
    # # filter out reviews with more than one sentence
    # reviews = reviews[~reviews['review_body'].str.contains(pat='[.][^.]|<br />|[!][^!]|[?][^?]', regex=True)]
    # try to filter out reviews for camera accessories by product title keywords
    filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
                    'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
    # build e.g. '[Bb]attery|[Cc]harger|...' so each word matches either capitalisation
    filter_pat = '|'.join('[{}{}]{}'.format(w[0].upper(), w[0].lower(), w[1:])
                          for w in filter_words)
    reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
    # shuffle reviews, then pick the first n
    reviews = reviews.sample(frac=1).reset_index(drop=True)
    reviews = reviews.head(n)
    print('Obtained', len(reviews), 'reviews')
    root = Element('reviews')
    for index, review in reviews.iterrows():
        review_node = SubElement(root, 'review')
        id_node = SubElement(review_node, 'review_id')
        id_node.text = review['review_id']
        title_node = SubElement(review_node, 'product_title')
        title_node.text = review['product_title']
        text_node = SubElement(review_node, 'review_body')
        # collapse runs of dots so the sentence tokenizer is not confused
        text = re.sub('[.][.]+', '...', review['review_body'])
        text_node.text = text
        sentences_node = SubElement(review_node, 'sentences')
        for sentence in sent_tokenizer.tokenize(text):
            sentence_node = SubElement(sentences_node, 'sentence')
            sentence_text_node = SubElement(sentence_node, 'text')
            sentence_text_node.text = sentence
            parse_tree = Tree.fromstring(nlp.parse(sentence))
            tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
            tokenized_text_node.text = ' '.join(parse_tree.leaves())
            # collect the leaf span of every NP subtree in the parse
            spans = [get_leaf_indices(parse_tree, subtree)
                     for subtree in parse_tree.subtrees()
                     if subtree.label() == 'NP']
            # visit larger spans first so containment checks see their subspans
            spans.sort(key=(lambda t: t[1] - t[0]), reverse=True)
            spans_to_delete = []
            for span in spans:
                subspans = [other for other in spans
                            if other != span and range_contains(span, other)]
                if range_cover(span, subspans):
                    # the subspans jointly cover this span: keep it, drop them
                    spans_to_delete.extend(subspans)
                elif subspans:
                    # only partially covered: prefer the contained spans
                    spans_to_delete.append(span)
            unique_spans = list(set(spans) - set(spans_to_delete))
            unique_spans.sort(key=(lambda t: t[0]))
            phrase_ranges_node = SubElement(sentence_node, 'phrase_ranges')
            for span in unique_spans:
                phrase_range_node = SubElement(phrase_ranges_node, 'phrase_range')
                phrase_range_node.text = '{},{}'.format(span[0], span[1])
    # pretty-print the tree and save it, dropping the blank lines minidom inserts
    xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
    xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
    with open(selected_reviews_location, 'w') as f:
        f.write(xmlstr)
def annotate_reviews():
    """Interactively collect (token range, argument type, sentiment) annotations
    for the prepared reviews in the terminal, appending each result to the
    annotation XML after every review.

    Fixes: ElementTree Elements were subscripted like pandas rows
    (`review['product_title']` etc. raise TypeError — use `.find(tag).text`);
    `findall('node2')` searched a nonexistent tag instead of 'review';
    elements were removed from an ElementTree while iterating it (ElementTree
    has no `.remove()` and is not directly iterable) — already-annotated
    reviews are now filtered into a list instead; 'q' only broke the inner
    prompt loop, skipping one review rather than quitting — it now returns.
    """
    row_character_count = 100  # maximum printed width of one index/token row
    reviews = parse(selected_reviews_location)
    annotated_reviews = parse(annotated_reviews_location) if os.path.isfile(annotated_reviews_location) else None
    root = annotated_reviews.getroot() if annotated_reviews else Element('data')
    n_annotated = len(root)
    # skip reviews that were annotated in an earlier session
    annotated_review_ids = [id_node.text for id_node in root.iter('review_id')]
    pending = [review for review in reviews.getroot().findall('review')
               if review.find('review_id').text not in annotated_review_ids]
    os.system('clear')
    for review in pending:
        print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
        print('')
        product_title = review.find('product_title').text
        print(bcolors.OKGREEN + product_title + bcolors.ENDC)
        print('')
        text = review.find('review_body').text
        tokens = tokenizer.tokenize(text)
        # print tokens with their indices aligned above them, wrapping rows at
        # row_character_count columns
        index_row = ''
        text_row = ''
        for t in range(len(tokens)):
            space = len(tokens[t]) - len(str(t))
            token_text = tokens[t] + ' '
            index_text = str(t) + ' '
            if space > 0:
                # token wider than its index: centre the index over the token
                index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
            elif space < 0:
                # index wider than the token: centre the token under the index
                space = abs(space)
                token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
            index_row += index_text
            text_row += token_text
            if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
                print(bcolors.WARNING + index_row + bcolors.ENDC)
                print(text_row)
                index_row = ''
                text_row = ''
        print(bcolors.WARNING + index_row + bcolors.ENDC)
        print(text_row)
        print('')
        annotations = []
        while True:
            task = input('Enter \'a\' to add an argument, \'c\' to save annotation and continue, \'s\' to skip, or \'q\' to quit: ')
            if task == 'a':
                rng = None
                arg = ''
                sentiment = ''
                while not rng:
                    inp = input('Enter the index (range) of the argument (in the form x or x,y): ')
                    if inp.isdigit():
                        rng = (int(inp), int(inp))
                    elif (',' in inp and len(inp.split(',')) == 2 and
                          inp.split(',')[0].isdigit() and inp.split(',')[1].isdigit()):
                        rng = (int(inp.split(',')[0]), int(inp.split(',')[1]))
                while not arg:
                    inp = input('Enter argument type (\'p\' for product, \'f\' for feature): ')
                    if inp in ['p', 'f']:
                        arg = inp
                while not sentiment:
                    inp = input('Enter the sentiment (\'+\', \'0\', \'-\') expressed towards the argument: ')
                    if inp in ['+', '0', '-']:
                        sentiment = inp
                annotations.append((rng, arg, sentiment))
            if task in ['c', 's']:
                n_annotated += 1
                # save annotations to tree; a skipped ('s') review is stored
                # without an <annotations> node so it is not offered again
                review_node = SubElement(root, 'review')
                id_node = SubElement(review_node, 'review_id')
                id_node.text = review.find('review_id').text
                text_node = SubElement(review_node, 'text')
                text_node.text = text
                if task == 'c':
                    annotations_node = SubElement(review_node, 'annotations')
                    for annotation in annotations:
                        annotation_node = SubElement(annotations_node, 'annotation')
                        range_node = SubElement(annotation_node, 'range')
                        range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
                        arg_node = SubElement(annotation_node, 'argument')
                        arg_node.text = 'product' if annotation[1] == 'p' else 'feature'
                        sent_node = SubElement(annotation_node, 'sentiment')
                        sent_node.text = sentiment_mappings[annotation[2]]
                # save tree to file after every review so progress survives a crash
                xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
                xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
                with open(annotated_reviews_location, 'w') as f:
                    f.write(xmlstr)
                os.system('clear')
                break
            if task == 'q':
                # quit the whole annotation session, not just this review
                return
# Toggle which stage of the pipeline to run when executing this script:
# prepare_reviews()
# annotate_reviews()
prepare_reviews()
This diff is collapsed.
......@@ -30,7 +30,7 @@ reviews = reviews[~reviews['review_body'].isnull()]
reviews = reviews[reviews['review_body'].apply(lambda x: len(str(x)) >= min_characters and len(str(x)) <= max_characters)]
# filter out reviews with more than one phrase
reviews = reviews[~reviews['product_title'].str.contains(pat = separators, regex = True)]
reviews = reviews[~reviews['product_title'].str.contains(pat=separators, regex=True)]
# pick out highly positive and negative reviews
positive_reviews = reviews[reviews['star_rating'].apply(lambda x: x == 5)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment