Commit 16cb2dcc authored by Joel Oksanen
Browse files

Changed review annotation to arrow key based

parent 52741466
...@@ -28,8 +28,8 @@ class BertAnalyzer: ...@@ -28,8 +28,8 @@ class BertAnalyzer:
self.net.load_state_dict(torch.load(trained_model_path)) self.net.load_state_dict(torch.load(trained_model_path))
self.net.eval() self.net.eval()
def train(self): def train(self, dataset):
train_data = BertDataset(semeval_2014_train_path) train_data = BertDataset(dataset)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
collate_fn=generate_batch) collate_fn=generate_batch)
...@@ -65,8 +65,8 @@ class BertAnalyzer: ...@@ -65,8 +65,8 @@ class BertAnalyzer:
torch.save(net.state_dict(), trained_model_path) torch.save(net.state_dict(), trained_model_path)
def evaluate(self): def evaluate(self, dataset):
test_data = BertDataset(semeval_2014_test_path) test_data = BertDataset(dataset)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
collate_fn=generate_batch) collate_fn=generate_batch)
...@@ -93,4 +93,4 @@ class BertAnalyzer: ...@@ -93,4 +93,4 @@ class BertAnalyzer:
sentiment_analyzer = BertAnalyzer() sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.load_saved() sentiment_analyzer.load_saved()
sentiment_analyzer.evaluate() sentiment_analyzer.evaluate(semeval_2014_test_path)
\ No newline at end of file \ No newline at end of file
...@@ -8,6 +8,9 @@ import nltk.data ...@@ -8,6 +8,9 @@ import nltk.data
from stanfordcorenlp import StanfordCoreNLP from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import ParentedTree as Tree from nltk.tree import ParentedTree as Tree
import re import re
import readchar
from sty import fg, bg, ef, rs
from wcwidth import wcswidth
data_location = 'amazon_reviews_us_Camera_v1_00.tsv' data_location = 'amazon_reviews_us_Camera_v1_00.tsv'
selected_reviews_location = 'reviews_to_be_annotated.xml' selected_reviews_location = 'reviews_to_be_annotated.xml'
...@@ -15,6 +18,7 @@ min_characters = 0 ...@@ -15,6 +18,7 @@ min_characters = 0
max_characters = 200 max_characters = 200
n = 500 n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'} sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
ann_bgs = {'positive': bg.green, 'neutral': bg.li_black, 'negative': bg.red, 'conflict': bg.yellow}
annotated_reviews_location = 'annotated_camera_reviews.xml' annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$'] included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS'] nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
...@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml' ...@@ -23,6 +27,7 @@ prepared_reviews_location = 'prepared_amazon_camera_reviews.xml'
tokenizer = TweetTokenizer() tokenizer = TweetTokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
class bcolors: class bcolors:
HEADER = '\033[95m' HEADER = '\033[95m'
OKBLUE = '\033[94m' OKBLUE = '\033[94m'
...@@ -33,6 +38,7 @@ class bcolors: ...@@ -33,6 +38,7 @@ class bcolors:
BOLD = '\033[1m' BOLD = '\033[1m'
UNDERLINE = '\033[4m' UNDERLINE = '\033[4m'
def get_leaf_indices(tree, phrase_tree): def get_leaf_indices(tree, phrase_tree):
phrase_tree_pos = phrase_tree.treeposition() phrase_tree_pos = phrase_tree.treeposition()
start = 0 start = 0
...@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree): ...@@ -43,26 +49,31 @@ def get_leaf_indices(tree, phrase_tree):
end += 1 end += 1
return (start, end) return (start, end)
def range_contains(r1, r2):
    """Return True if range ``r1`` contains range ``r2``.

    Each range is indexed as ``(start, end, tree_str)``. ``r1`` contains
    ``r2`` when its bounds enclose those of ``r2`` and the tree parsed from
    ``r2[2]`` occurs among the subtrees of the tree parsed from ``r1[2]``.
    """
    # Cheap bounds comparison first; the trees are only parsed when it passes.
    if r1[0] > r2[0] or r1[1] < r2[1]:
        return False
    return Tree.fromstring(r2[2]) in Tree.fromstring(r1[2]).subtrees()
def in_range(r, n):
    """Return True if ``n`` lies within the range ``r``.

    Only ``r[0]`` (start) and ``r[1]`` (end) are read; both bounds are
    inclusive.
    """
    return r[0] <= n <= r[1]
def range_cover(r, rs):
    """Return True if the ranges in ``rs`` together cover ``r``.

    Every integer position from ``r[0]`` to ``r[1]`` inclusive must fall
    within the inclusive bounds of at least one range in ``rs``. When
    ``r[0] > r[1]`` there are no positions to check and the result is True.
    """
    return all(
        any(other[0] <= pos <= other[1] for other in rs)
        for pos in range(r[0], r[1] + 1)
    )
def is_opinion_target(tree):
    """Return True if ``tree`` qualifies as an opinion target.

    The root label must be in ``included_labels``, and every subtree yielded
    by ``tree.subtrees()`` must either carry an included label or be a
    ``PRP`` node whose first child is the word "it" (case-insensitive).
    """
    if tree.label() not in included_labels:
        return False
    return all(
        sub.label() in included_labels
        or (sub.label() == 'PRP' and sub[0].lower() == 'it')
        for sub in tree.subtrees()
    )
def prepare_reviews(): def prepare_reviews():
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
...@@ -71,13 +82,13 @@ def prepare_reviews(): ...@@ -71,13 +82,13 @@ def prepare_reviews():
# try to filter out reviews for camera accessories # try to filter out reviews for camera accessories
filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag', filter_words = ['accessor', 'battery', 'charger', 'tripod', 'strap', 'case', 'bag',
'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security'] 'backpack', 'kit', 'printer', 'adapter', 'album', 'surveillance', 'security']
filter_pat = '' filter_pat = ''
for word in filter_words: for word in filter_words:
word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:] word_filter = '[' + word[0].upper() + word[0].lower() + ']' + word[1:]
filter_pat += word_filter + '|' filter_pat += word_filter + '|'
filter_pat = filter_pat[:-1] filter_pat = filter_pat[:-1]
reviews = reviews[~reviews['product_title'].str.contains(pat = filter_pat, regex = True)] reviews = reviews[~reviews['product_title'].str.contains(pat=filter_pat, regex=True)]
# shuffle reviews # shuffle reviews
reviews = reviews.sample(frac=1).reset_index(drop=True) reviews = reviews.sample(frac=1).reset_index(drop=True)
...@@ -121,7 +132,7 @@ def prepare_reviews(): ...@@ -121,7 +132,7 @@ def prepare_reviews():
parse_tree_node.text = parse_tree_str parse_tree_node.text = parse_tree_str
tokenized_text_node = SubElement(sentence_node, 'tokenized_text') tokenized_text_node = SubElement(sentence_node, 'tokenized_text')
tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``','""') tokenized_text_node.text = ' '.join(parse_tree.leaves()).replace('``', '""')
# save tree to file # save tree to file
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ') xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
...@@ -131,6 +142,7 @@ def prepare_reviews(): ...@@ -131,6 +142,7 @@ def prepare_reviews():
print('Obtained and parsed', len(reviews), 'reviews') print('Obtained and parsed', len(reviews), 'reviews')
def annotate_reviews(): def annotate_reviews():
row_character_count = 100 row_character_count = 100
reviews = parse(selected_reviews_location) reviews = parse(selected_reviews_location)
...@@ -143,69 +155,69 @@ def annotate_reviews(): ...@@ -143,69 +155,69 @@ def annotate_reviews():
for review in not_annotated: for review in not_annotated:
for sentence in review.find('sentences'): for sentence in review.find('sentences'):
tokens = sentence.find('tokenized_text').text.split(' ') text = sentence.find('text').text
cursor_pos = 0
os.system('clear') start = None
end = None
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\'|\'c\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
product_title = review.find('product_title').text
print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
index_row = ''
text_row = ''
for t in range(len(tokens)):
space = len(tokens[t]) - len(str(t))
token_text = tokens[t] + ' '
index_text = str(t) + ' '
if space > 0:
index_text = ' ' * math.floor(space / 2) + index_text + ' ' * math.ceil(space / 2)
elif space < 0:
space = abs(space)
token_text = ' ' * math.floor(space / 2) + token_text + ' ' * math.ceil(space / 2)
index_row += index_text
text_row += token_text
if t + 1 < len(tokens) and len(index_row) + len(tokens[t + 1]) + 1 > row_character_count:
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
index_row = ''
text_row = ''
print(bcolors.WARNING + index_row + bcolors.ENDC)
print(text_row)
print('')
annotations = [] annotations = []
while True: while True:
task = input(': ') os.system('clear')
if len(task.split(' ')) == 2: print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
rng = None print('')
sentiment = ''
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
fst = task.split(' ')[0] print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
if fst.isdigit(): print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
rng = (int(fst), int(fst)) print('')
elif (',' in fst and len(fst.split(',')) == 2 and
fst.split(',')[0].isdigit() and fst.split(',')[1].isdigit()): product_title = review.find('product_title').text
rng = (int(fst.split(',')[0]), int(fst.split(',')[1])) print(bcolors.OKGREEN + product_title + bcolors.ENDC)
print('')
snd = task.split(' ')[1]
if snd in sentiment_mappings.keys(): text_row = ''
sentiment = snd for t in range(len(text)):
char = text[t]
if rng and sentiment: if t == cursor_pos:
annotations.append((rng, sentiment)) char = bg.blue + char + bg.rs
for ann in annotations:
if t in range(ann[0][0], ann[0][1]):
char = ann_bgs[ann[1]] + char + bg.rs
text_row += char
if (t + 1) % row_character_count == 0:
print(text_row)
text_row = ''
print(text_row)
print('')
task = readchar.readkey()
if task == readchar.key.RIGHT:
cursor_pos = min(cursor_pos + 1, len(text) - 1)
if task == readchar.key.LEFT:
cursor_pos = max(cursor_pos - 1, 0)
if task == readchar.key.DOWN:
cursor_pos = min(cursor_pos + row_character_count, len(text) - 1)
if task == readchar.key.UP:
cursor_pos = max(cursor_pos - row_character_count, 0)
if task == readchar.key.SPACE:
if start == None:
start = cursor_pos
elif end == None and cursor_pos >= start:
end = cursor_pos+1
rng = (start, end)
while True:
inp = input('Sentiment for {},{}: '.format(start, end-1))
if inp in sentiment_mappings.keys():
annotations.append((rng, sentiment_mappings[inp]))
start = None
end = None
cursor_pos = min(cursor_pos + 1, len(text) - 1)
break
if task in ['n', 's', 'q']: if task in ['n', 's', 'q']:
if task in ['n']: if task in ['n']:
...@@ -216,7 +228,7 @@ def annotate_reviews(): ...@@ -216,7 +228,7 @@ def annotate_reviews():
range_node = SubElement(annotation_node, 'range') range_node = SubElement(annotation_node, 'range')
range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1]) range_node.text = '{},{}'.format(annotation[0][0], annotation[0][1])
sent_node = SubElement(annotation_node, 'sentiment') sent_node = SubElement(annotation_node, 'sentiment')
sent_node.text = sentiment_mappings[annotation[1]] sent_node.text = annotation[1]
break break
if task == 'q': if task == 'q':
break break
...@@ -232,6 +244,7 @@ def annotate_reviews(): ...@@ -232,6 +244,7 @@ def annotate_reviews():
with open(selected_reviews_location, 'w') as f: with open(selected_reviews_location, 'w') as f:
f.write(xmlstr) f.write(xmlstr)
def longest_common_subsequence(x, y): def longest_common_subsequence(x, y):
seq = [] seq = []
for i in range(min(len(x), len(y))): for i in range(min(len(x), len(y))):
...@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y): ...@@ -241,6 +254,7 @@ def longest_common_subsequence(x, y):
return tuple(seq) return tuple(seq)
def labelled_tree_str(tree_str, start, end): def labelled_tree_str(tree_str, start, end):
tree = Tree.fromstring(tree_str) tree = Tree.fromstring(tree_str)
start_pos = tree.leaf_treeposition(start) start_pos = tree.leaf_treeposition(start)
...@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end): ...@@ -248,7 +262,7 @@ def labelled_tree_str(tree_str, start, end):
# find highest parent node common to start and end # find highest parent node common to start and end
if start == end: if start == end:
parent_pos = start_pos[:len(start_pos)-1] parent_pos = start_pos[:len(start_pos) - 1]
else: else:
parent_pos = longest_common_subsequence(start_pos, end_pos) parent_pos = longest_common_subsequence(start_pos, end_pos)
parent_node = tree[parent_pos] parent_node = tree[parent_pos]
...@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end): ...@@ -257,7 +271,7 @@ def labelled_tree_str(tree_str, start, end):
parent_pos = parent_pos[:len(parent_pos) - 1] parent_pos = parent_pos[:len(parent_pos) - 1]
# remove branches between start and end inclusive # remove branches between start and end inclusive
child_index_rng = range(start_pos[len(parent_pos)], end_pos[len(parent_pos)]+1) child_index_rng = range(start_pos[len(parent_pos)], end_pos[len(parent_pos)] + 1)
child_positions = [list(parent_pos) + [i] for i in child_index_rng] child_positions = [list(parent_pos) + [i] for i in child_index_rng]
children_to_remove = [tree[tuple(child_pos)] for child_pos in child_positions] children_to_remove = [tree[tuple(child_pos)] for child_pos in child_positions]
for child in children_to_remove: for child in children_to_remove:
...@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end): ...@@ -268,27 +282,32 @@ def labelled_tree_str(tree_str, start, end):
return str(tree) return str(tree)
def prepare_annotated_reviews(): def prepare_annotated_reviews():
reviews = parse(selected_reviews_location) reviews = parse(selected_reviews_location)
root = reviews.getroot() root = reviews.getroot()
annotated = [review for review in root if review.attrib['annotated'] == 'true'] annotated = [review for review in root if review.attrib['annotated'] == 'true']
prepared_root = Element('data') prepared_root = Element('sentences')
for review in annotated: for review in annotated:
for sentence in review.find('sentences'): for sentence in review.find('sentences'):
text = sentence.find('text').text text = sentence.find('text').text
tree_str = sentence.find('parse_tree').text tree_str = sentence.find('parse_tree').text
annotations = sentence.find('annotations') if sentence.find('annotations') else [] sentence_node = SubElement(prepared_root, 'sentence')
for annotation in annotations: text_node = SubElement(sentence_node, 'text')
instance_node = SubElement(prepared_root, 'instance') text_node.text = text
text_node = SubElement(instance_node, 'text')
text_node.text = text if sentence.find('annotations'):
op_node = SubElement(instance_node, 'opinion') aspect_terms_node = SubElement(sentence_node, 'aspectTerms')
op_node.text = annotation.find('sentiment').text
tree_node = SubElement(instance_node, 'tree') for annotation in sentence.find('annotations'):
start, end = annotation.find('range').text.split(',') start, end = annotation.find('range').text.split(',')
tree_node.text = labelled_tree_str(tree_str, int(start), int(end)) aspect_term_node = SubElement(aspect_terms_node, 'aspectTerm')
aspect_term_node.set('term', text[start:end])
aspect_term_node.set('polarity', annotation.find('sentiment').text)
aspect_term_node.set('from', start)
aspect_term_node.set('to', end)
train_count = 1000 train_count = 1000
train_root = Element('data') train_root = Element('data')
...@@ -316,6 +335,7 @@ def prepare_annotated_reviews(): ...@@ -316,6 +335,7 @@ def prepare_annotated_reviews():
with open('amazon_camera_test.xml', 'w') as f: with open('amazon_camera_test.xml', 'w') as f:
f.write(xmlstr) f.write(xmlstr)
# prepare_reviews() # prepare_reviews()
# annotate_reviews() annotate_reviews()
prepare_annotated_reviews() # prepare_annotated_reviews()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment