Commit 7718ad0b authored by Joel Oksanen
Browse files

Added SemEval data

parent 3f3df636
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from nltk.tree import ParentedTree as Tree
import re
from xml.dom import minidom
import os
# Prepare SemEval-2014 laptop data: for each annotated aspect term, obtain a
# CoreNLP constituency parse of the sentence with the aspect term replaced by
# the placeholder token 'ARG', and write one <instance> per aspect term
# (text, polarity, labelled parse tree) to an output XML file.

filepath = 'Laptops_Test_Gold.xml'
output = 'SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'

# local CoreNLP installation used to produce the parse trees
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

root = Element('data')

opinion_labels = ['negative', 'neutral', 'positive', 'conflict']
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0,
    'conflict': 0
}

# renamed from 'input' to avoid shadowing the builtin
source_tree = parse(filepath)
for sentence in source_tree.getroot():
    text = sentence.find('text').text
    # replace all occurrences of two or more . with standardised ...
    text = re.sub('[.][.]+', '...', text)
    # find() returns None when <aspectTerms> is absent; the original
    # truth-test ('if not sentence.find(...)') relied on the deprecated
    # falsiness of childless Elements — compare against None instead
    aspect_terms = sentence.find('aspectTerms')
    if aspect_terms is None:
        continue
    for aspect_term in aspect_terms:
        arg_from = int(aspect_term.attrib['from'])
        arg_to = int(aspect_term.attrib['to'])
        opinion = aspect_term.attrib['polarity']
        # get corenlp tree with the aspect term replaced by 'ARG' in place
        parse_tree_str = nlp.parse(text[:arg_from] + 'ARG' + text[arg_to:])
        parse_tree = Tree.fromstring(parse_tree_str)
        labelled_parse_tree_str = str(parse_tree)
        # append one <instance> node per aspect term
        instance_node = SubElement(root, 'instance')
        text_node = SubElement(instance_node, 'text')
        text_node.text = text
        opinion_node = SubElement(instance_node, 'opinion')
        opinion_node.text = opinion
        opinion_tree_node = SubElement(instance_node, 'tree')
        opinion_tree_node.text = labelled_parse_tree_str
        prepared_counts[opinion] += 1

# pretty-print and strip the blank lines minidom inserts between elements
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(output, 'w') as f:
    f.write(xmlstr)

print(prepared_counts)
......@@ -9,6 +9,18 @@ from sklearn.metrics import confusion_matrix
from instance import Instance
from sklearn.feature_extraction.text import CountVectorizer
import os
import math
def resample_data(instances, labels):
    """Oversample *instances* so every label is equally represented.

    Each class is repeated whole and then truncated so that every label
    with at least one instance contributes exactly as many instances as
    the largest class.

    :param instances: iterable of objects exposing an ``opinion`` attribute
    :param labels: the class labels to balance over
    :return: list of instances, grouped by label in ``labels`` order

    Fix over the original: a label present in ``labels`` but absent from
    ``instances`` no longer raises ZeroDivisionError — it is skipped.
    """
    label_instances = {
        label: [inst for inst in instances if inst.opinion == label]
        for label in labels
    }
    # size of the largest class; default=0 covers an empty instance list
    max_n_instances = max((len(v) for v in label_instances.values()), default=0)
    resampled_data = []
    for label in labels:
        class_instances = label_instances[label]
        if not class_instances:
            # original divided by len(...) == 0 here; skip empty classes
            continue
        repeats = math.ceil(max_n_instances / len(class_instances))
        resampled_data += (class_instances * repeats)[:max_n_instances]
    print(len(resampled_data))
    return resampled_data
class SentimentAnalyzer:
......@@ -16,7 +28,7 @@ class SentimentAnalyzer:
def train_expr_clf(self, instances):
fvs = [instance.vector for instance in instances]
targets = [instance.opinion != 'neutral' for instance in instances]
targets = [instance.opinion for instance in instances]
self.expr_clf.fit(fvs, targets)
def get_feature_vector(self, instance):
......@@ -27,17 +39,24 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances):
return self.expr_clf.predict([instance.vector for instance in instances])
semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml'
semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' #
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
tweet_test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml'
train_path = tweet_train_path
test_path = tweet_test_path
train_path = semeval_2014_train_path
test_path = semeval_2014_test_path
labels = ['positive', 'neutral', 'negative', 'conflict']
sa = SentimentAnalyzer()
train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()]
train_instances = resample_data(train_instances, labels)
# create and train vectorizer model
vec = Vectorizer(train_instances)
......@@ -54,10 +73,10 @@ vec.vectorize(test_instances)
# predict test set values
pred = sa.expresses_sentiment(test_instances)
targets = [instance.opinion != 'neutral' for instance in test_instances]
targets = [instance.opinion for instance in test_instances]
# evaluate results
cm = confusion_matrix(targets, pred, labels=[True, False])
cm = confusion_matrix(targets, pred, labels=labels)
acc = len([i for i in range(len(targets)) if targets[i] == pred[i]]) / len(targets)
print(cm)
print('accuracy:', acc)
......@@ -3,6 +3,7 @@ import numpy as np
import pandas as pd
from feature_counter import FeatureCounter
from nltk.tree import ParentedTree as Tree
from nltk.stem import PorterStemmer
class Vectorizer:
......@@ -10,6 +11,7 @@ class Vectorizer:
negations = ['not', 'no', 'never', 'n\'t', 'neither', 'seldom', 'hardly']
copulas = ['is', '\'s', 'was', 'were']
adjetives_and_nouns = ['JJ', 'JJR', 'JJS', 'ADJP', 'NN', 'NNS', 'NP']
ps = PorterStemmer()
def __init__(self, train_instances):
self.transformer = TfidfTransformer()
......@@ -26,7 +28,7 @@ class Vectorizer:
train_dep_vectors = self.get_dep_vectors(train_instances, learning=True)
# store vectors for training set:
train_vectors = train_indep_vectors # np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = self.transformer.fit_transform(train_vectors).toarray()
for i in range(len(train_instances)):
......@@ -45,7 +47,7 @@ class Vectorizer:
# dep features:
dep_vectors = self.get_dep_vectors(instances, learning=False)
# store vectors:
vectors = indep_vectors # np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = self.transformer.fit_transform(vectors).toarray()
for i in range(len(instances)):
instances[i].vector = vectors[i]
......@@ -185,7 +187,7 @@ class Vectorizer:
# modified with inclusion of particles before and after ARG
arg_pos = i + 1
f = self.left_sibling_negation(subtree)
f = f + subtree[i].leaves()[0].lower()
f = f + self.ps.stem(subtree[i].leaves()[0].lower())
if self.is_particle(subtree[i+1]):
f = f + '-' + subtree[i+1].leaves()[0].lower()
arg_pos = i + 2
......@@ -200,23 +202,23 @@ class Vectorizer:
if self.is_verb(subtree[i+1][j]):
if self.has_noun(subtree[i+1]):
# rule 2: transitive verb with ARG as its subject
f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_arg1'
f = self.sibling_negation(subtree[i+1][j]) + self.ps.stem(subtree[i+1][j].leaves()[0].lower()) + '_arg1'
features.append(f)
else:
# rule 3: intransitive verb with ARG as its subject
f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_it_arg1'
f = self.sibling_negation(subtree[i+1][j]) + self.ps.stem(subtree[i+1][j].leaves()[0].lower()) + '_it_arg1'
features.append(f)
if j + 1 < len(subtree[i+1]) and self.is_adverb(subtree[i+1][j+1]):
# rule 7: adverb modifies verb with ARG as subject
main = self.main_phrase(subtree[i+1][j+1])
f = 'arg1_v_' + self.sibling_negation(subtree[i+1][j+1]) + '-'.join(main.leaves()).lower()
f = 'arg1_v_' + self.sibling_negation(subtree[i+1][j+1]) + self.ps.stem('-'.join(main.leaves()).lower())
features.append(f)
if self.is_arg_phrase(subtree):
for i in range(len(subtree)-1):
if self.is_adjective(subtree[i]):
# rule 4.1: adjective with ARG as its head
f = '-'.join(subtree[i].leaves()).lower() + '_arg1'
f = self.ps.stem('-'.join(subtree[i].leaves()).lower()) + '_arg1'
if self.preceded_by_adverb(subtree[i]):
# addition to rules: include preceding adverb
f = subtree[i-1].leaves()[0] + '-' + f
......@@ -224,7 +226,7 @@ class Vectorizer:
features.append(f)
if self.is_noun(subtree[i]) and subtree[i] == 'ARG':
# rule 4.2: noun with ARG as its head
f = self.left_sibling_negation(subtree[i]) + '-'.join(subtree[i].leaves()).lower() + '_arg1'
f = self.left_sibling_negation(subtree[i]) + self.ps.stem('-'.join(subtree[i].leaves()).lower()) + '_arg1'
features.append(f)
if self.is_copula_phrase(subtree.right_sibling()):
......@@ -233,7 +235,7 @@ class Vectorizer:
for n in self.get_descriptive_nodes(p):
# rule 5: adjective or noun connected by a copula with ARG
# extended to include adverb phrases, as these tend to include an adjective
f = self.sibling_negation(subtree.right_sibling()[0]) + '-'.join(n.leaves()) + '_cp_arg1'
f = self.sibling_negation(subtree.right_sibling()[0]) + self.ps.stem('-'.join(n.leaves())) + '_cp_arg1'
features.append(f)
return features
......@@ -15,7 +15,7 @@ selected_reviews_location = 'reviews_to_be_annotated.xml'
min_characters = 0
max_characters = 200
n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'}
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
......@@ -151,8 +151,8 @@ def annotate_reviews():
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('')
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'continue: \'c\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\'|\'c\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('')
......@@ -202,14 +202,14 @@ def annotate_reviews():
rng = (int(fst.split(',')[0]), int(fst.split(',')[1]))
snd = task.split(' ')[1]
if snd in ['+', '0', '-']:
if snd in sentiment_mappings.keys():
sentiment = snd
if rng and sentiment:
annotations.append((rng, sentiment))
if task in ['c', 's', 'q']:
if task in ['c']:
if task in ['n', 's', 'q']:
if task in ['n']:
# save annotations to tree
annotations_node = SubElement(sentence, 'annotations')
for annotation in annotations:
......@@ -291,14 +291,30 @@ def prepare_annotated_reviews():
start, end = annotation.find('range').text.split(',')
tree_node.text = labelled_tree_str(tree_str, int(start), int(end))
counts = {'positive': 0, 'neutral': 0, 'negative': 0}
train_count = 1000
train_root = Element('data')
test_root = Element('data')
counts = {'positive': 0, 'neutral': 0, 'negative': 0, 'conflict': 0}
for instance in prepared_root:
if counts[instance.find('opinion').text] < train_count:
train_root.append(instance)
else:
test_root.append(instance)
counts[instance.find('opinion').text] += 1
print(counts)
print(len(train_root))
print(len(test_root))
xmlstr = minidom.parseString(tostring(train_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_train.xml', 'w') as f:
f.write(xmlstr)
xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ')
xmlstr = minidom.parseString(tostring(test_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(prepared_reviews_location, 'w') as f:
with open('amazon_camera_test.xml', 'w') as f:
f.write(xmlstr)
# prepare_reviews()
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment