Commit 16cb2dcc authored by Joel Oksanen
Browse files

Changed review annotation to arrow key based

parent 52741466
......@@ -28,8 +28,8 @@ class BertAnalyzer:
self.net.load_state_dict(torch.load(trained_model_path))
self.net.eval()
def train(self):
train_data = BertDataset(semeval_2014_train_path)
def train(self, dataset):
train_data = BertDataset(dataset)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
collate_fn=generate_batch)
......@@ -65,8 +65,8 @@ class BertAnalyzer:
torch.save(net.state_dict(), trained_model_path)
def evaluate(self):
test_data = BertDataset(semeval_2014_test_path)
def evaluate(self, dataset):
test_data = BertDataset(dataset)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
collate_fn=generate_batch)
......@@ -93,4 +93,4 @@ class BertAnalyzer:
sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.load_saved()
sentiment_analyzer.evaluate()
\ No newline at end of file
sentiment_analyzer.evaluate(semeval_2014_test_path)
\ No newline at end of file
......@@ -8,6 +8,9 @@ import nltk.data
from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import ParentedTree as Tree
import re
import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location = 'reviews_to_be_annotated.xml'
......@@ -15,6 +18,7 @@ min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_bgs = {'positive': bg.green, 'neutral': bg.li_black, 'negative': bg.red, 'conflict': bg.yellow}
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
......@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
......@@ -33,6 +38,7 @@ class bcolors:
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
def get_leaf_indices(tree, phrase_tree):
phrase_tree_pos = phrase_tree.treeposition()
start = 0
......@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree):
end += 1
return (start, end)
# true if r1 contains r2
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Both arguments are indexable records: item 0 is the lower bound, item 1
    the upper bound, and item 2 a bracketed tree string that
    ``Tree.fromstring`` can parse. ``r1`` contains ``r2`` when its bounds
    enclose those of ``r2`` and the tree parsed from ``r2`` equals one of the
    subtrees of the tree parsed from ``r1``.
    """
    bounds_enclose = r1[0] <= r2[0] and r1[1] >= r2[1]
    if not bounds_enclose:
        # Skip the tree parsing entirely when the index bounds already fail.
        return False
    inner_tree = Tree.fromstring(r2[2])
    outer_tree = Tree.fromstring(r1[2])
    return inner_tree in outer_tree.subtrees()
def in_range(r, n):
    """Return True if ``n`` lies within ``r``, both bounds inclusive.

    Only ``r[0]`` (lower bound) and ``r[1]`` (upper bound) are read, so ``r``
    may carry extra trailing items.
    """
    return r[0] <= n <= r[1]
# true if rs cover r
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover every position of ``r``.

    ``r`` and each element of ``rs`` are indexable with an inclusive lower
    bound at index 0 and an inclusive upper bound at index 1; any further
    items are ignored. Every integer from ``r[0]`` to ``r[1]`` must fall
    inside at least one range of ``rs``.

    An empty ``r`` (``r[0] > r[1]``) is trivially covered and yields True;
    a non-empty ``r`` with an empty ``rs`` yields False.
    """
    return all(
        # Same inclusive membership test that in_range() performs.
        any(other[0] <= pos <= other[1] for other in rs)
        for pos in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as an opinion target phrase.

    The tree's own label must be one of ``included_labels``, and every
    subtree yielded by ``tree.subtrees()`` must either carry a label from
    ``included_labels`` or be a ``PRP`` node whose first child, lower-cased,
    is ``'it'``.
    """
    if tree.label() not in included_labels:
        return False
    for sub in tree.subtrees():
        label = sub.label()
        if label in included_labels:
            continue
        # The pronoun "it" is the only label outside included_labels allowed.
        if label == 'PRP' and sub[0].lower() == 'it':
            continue
        return False
    return True
def prepare_reviews():
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
......@@ -71,13 +82,13 @@ def prepare_reviews():
# try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = ''
for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat = filter_pat, regex = True)]
reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
# shuffle reviews
reviews = reviews.sample(frac=1).reset_index(drop=True)
......@@ -121,7 +132,7 @@ def prepare_reviews():
parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``','""')
tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``', '""')
# save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
......@@ -131,6 +142,7 @@ def prepare_reviews():
print('Obtained and parsed', len(reviews), 'reviews')
def annotate_reviews():
row_character_count = 100
reviews = parse(selected_reviews_location)
......@@ -143,69 +155,69 @@ def annotate_reviews():
for review in not_annotated:
for sentence in review.find('sentences'):
tokens = sentence.find('tokenized_text').text.split(' ')
os.system('clear')
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\'|\'c\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
index_row = ''
text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
print('')
text = sentence.find('text').text
cursor_pos = 0
start = None
end = None
annotations = []
while True:
task = input(': ')
if len(task.split(' ')) == 2:
rng = None
sentiment = ''
fst = task.split(' ')[0]
if fst.isdigit():
rng = (int(fst), int(fst))
elif (',' in fst and len(fst.split(',')) == 2 and
fst.split(',')[0].isdigit() and fst.split(',')[1].isdigit()):
rng = (int(fst.split(',')[0]), int(fst.split(',')[1]))
snd = task.split(' ')[1]
if snd in sentiment_mappings.keys():
sentiment = snd
if rng and sentiment:
annotations.append((rng, sentiment))
os.system('clear')
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
text_row = ''
for t in range(len(text)):
char = text[t]
if t == cursor_pos:
char = bg.blue + char + bg.rs
for ann in annotations:
if t in range(ann[0][0], ann[0][1]):
char = ann_bgs[ann[1]] + char + bg.rs
text_row += char
if (t + 1) % row_character_count == 0:
print(text_row)
text_row = ''
print(text_row)
print('')
task = readchar.readkey()
if task == readchar.key.RIGHT:
cursor_pos = min(cursor_pos + 1, len(text) - 1)
if task == readchar.key.LEFT:
cursor_pos = max(cursor_pos - 1, 0)
if task == readchar.key.DOWN:
cursor_pos = min(cursor_pos + row_character_count, len(text) - 1)
if task == readchar.key.UP:
cursor_pos = max(cursor_pos - row_character_count, 0)
if task == readchar.key.SPACE:
if start == None:
start = cursor_pos
elif end == None and cursor_pos >= start:
end = cursor_pos+1
rng = (start, end)
while True:
inp = input('Sentiment for {},{}: '.format(start, end-1))
if inp in sentiment_mappings.keys():
annotations.append((rng, sentiment_mappings[inp]))
start = None
end = None
cursor_pos = min(cursor_pos + 1, len(text) - 1)
break
if task in ['n', 's', 'q']:
if task in ['n']:
......@@ -216,7 +228,7 @@ def annotate_reviews():
range_node = SubElement(annotation_node, 'range')
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[1]]
sent_node.text = annotation[1]
break
if task == 'q':
break
......@@ -232,6 +244,7 @@ def annotate_reviews():
with open(selected_reviews_location, 'w') as f:
f.write(xmlstr)
def longest_common_subsequence(x, y):
seq = []
for i in range(min(len(x), len(y))):
......@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y):
return tuple(seq)
def labelled_tree_str(tree_str, start, end):
tree = Tree.fromstring(tree_str)
start_pos = tree.leaf_treeposition(start)
......@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end):
# find highest parent node common to start and end
if start == end:
parent_pos = start_pos[:len(start_pos)-1]
parent_pos = start_pos[:len(start_pos) - 1]
else:
parent_pos = longest_common_subsequence(start_pos, end_pos)
parent_node = tree[parent_pos]
......@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end):
parent_pos = parent_pos[:len(parent_pos) - 1]
# remove branches between start and end inclusive
child_index_rng = range(start_pos[len(parent_pos)], end_pos[len(parent_pos)]+1)
child_index_rng = range(start_pos[len(parent_pos)], end_pos[len(parent_pos)] + 1)
child_positions = [list(parent_pos) + [i] for i in child_index_rng]
children_to_remove = [tree[tuple(child_pos)] for child_pos in child_positions]
for child in children_to_remove:
......@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end):
return str(tree)
def prepare_annotated_reviews():
reviews = parse(selected_reviews_location)
root = reviews.getroot()
annotated = [review for review in root if review.attrib['annotated'] == 'true']
prepared_root = Element('data')
prepared_root = Element('sentences')
for review in annotated:
for sentence in review.find('sentences'):
text = sentence.find('text').text
tree_str = sentence.find('parse_tree').text
annotations = sentence.find('annotations') if sentence.find('annotations') else []
for annotation in annotations:
instance_node = SubElement(prepared_root, 'instance')
text_node = SubElement(instance_node, 'text')
text_node.text = text
op_node = SubElement(instance_node, 'opinion')
op_node.text = annotation.find('sentiment').text
tree_node = SubElement(instance_node, 'tree')
start, end = annotation.find('range').text.split(',')
tree_node.text = labelled_tree_str(tree_str, int(start), int(end))
sentence_node = SubElement(prepared_root, 'sentence')
text_node = SubElement(sentence_node, 'text')
text_node.text = text
if sentence.find('annotations'):
aspect_terms_node = SubElement(sentence_node, 'aspectTerms')
for annotation in sentence.find('annotations'):
start, end = annotation.find('range').text.split(',')
aspect_term_node = SubElement(aspect_terms_node, 'aspectTerm')
aspect_term_node.set('term', text[start:end])
aspect_term_node.set('polarity', annotation.find('sentiment').text)
aspect_term_node.set('from', start)
aspect_term_node.set('to', end)
train_count = 1000
train_root = Element('data')
......@@ -316,6 +335,7 @@ def prepare_annotated_reviews():
with open('amazon_camera_test.xml', 'w') as f:
f.write(xmlstr)
# prepare_reviews()
# annotate_reviews()
prepare_annotated_reviews()
annotate_reviews()
# prepare_annotated_reviews()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment