Commit 7718ad0b authored by  Joel  Oksanen's avatar Joel Oksanen
Browse files

Added SemEval data

parent 3f3df636
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
from nltk.tree import ParentedTree as Tree
import re
from xml.dom import minidom
import os
# Prepare SemEval-2014 laptop aspect-sentiment data: for each aspect term in
# each sentence, parse the sentence with Stanford CoreNLP (the aspect term
# replaced in-text by the placeholder 'ARG') and emit one <instance> element
# holding the sentence text, the polarity label, and the parse tree.
filepath = 'Laptops_Test_Gold.xml'
output = 'SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'

# Requires a local CoreNLP distribution; nlp.parse() talks to the Java backend.
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

root = Element('data')
opinion_labels = ['negative', 'neutral', 'positive', 'conflict']
# Per-polarity count of instances written, printed as a summary at the end.
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0,
    'conflict': 0
}

# Renamed from 'input' to avoid shadowing the builtin.
source_tree = parse(filepath)

for sentence in source_tree.getroot():
    text = sentence.find('text').text
    # replace all occurrences of two or more . with standardised ...
    text = re.sub('[.][.]+', '...', text)
    aspect_terms = sentence.find('aspectTerms')
    # BUG FIX: the original tested 'if not sentence.find(...)', which relies on
    # Element truthiness (an element with no children is falsy and the test is
    # deprecated by ElementTree). Test for absence explicitly; a present-but-empty
    # <aspectTerms> simply yields an empty loop below, so output is unchanged.
    if aspect_terms is None:
        continue
    for aspect_term in aspect_terms:
        arg_from = int(aspect_term.attrib['from'])
        arg_to = int(aspect_term.attrib['to'])
        opinion = aspect_term.attrib['polarity']
        # get corenlp tree with the aspect term replaced by the placeholder ARG
        parse_tree_str = nlp.parse(text[:arg_from] + 'ARG' + text[arg_to:])
        # round-trip through nltk to normalise the tree's string representation
        parse_tree = Tree.fromstring(parse_tree_str)
        labelled_parse_tree_str = str(parse_tree)
        instance_node = SubElement(root, 'instance')
        text_node = SubElement(instance_node, 'text')
        text_node.text = text
        opinion_node = SubElement(instance_node, 'opinion')
        opinion_node.text = opinion
        opinion_tree_node = SubElement(instance_node, 'tree')
        opinion_tree_node.text = labelled_parse_tree_str
        prepared_counts[opinion] += 1

# Pretty-print, then drop the blank lines minidom inserts between elements.
xmlstr = minidom.parseString(tostring(root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(output, 'w') as f:
    f.write(xmlstr)

print(prepared_counts)
...@@ -9,6 +9,18 @@ from sklearn.metrics import confusion_matrix ...@@ -9,6 +9,18 @@ from sklearn.metrics import confusion_matrix
from instance import Instance from instance import Instance
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
import os import os
import math
def resample_data(instances, labels):
    """Oversample instances so every label is as frequent as the most common one.

    instances: objects carrying an `opinion` attribute naming their label.
    labels: labels to include, in the order the output is grouped by.
    Returns a new list; input is not mutated.

    BUG FIX: the original divided by len(group) unconditionally and raised
    ZeroDivisionError when a requested label had no instances; such labels
    are now skipped.
    """
    label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels}
    max_n_instances = max([len(v) for v in label_instances.values()])
    resampled_data = []
    for label in labels:
        group = label_instances[label]
        if not group:
            # nothing to replicate for this label; skip instead of dividing by zero
            continue
        # replicate the group just enough times to reach max_n_instances, then trim
        m = math.ceil(max_n_instances / len(group))
        resampled_data += (group * m)[:max_n_instances]
    print(len(resampled_data))  # diagnostic: total size after resampling
    return resampled_data
class SentimentAnalyzer: class SentimentAnalyzer:
...@@ -16,7 +28,7 @@ class SentimentAnalyzer: ...@@ -16,7 +28,7 @@ class SentimentAnalyzer:
def train_expr_clf(self, instances): def train_expr_clf(self, instances):
fvs = [instance.vector for instance in instances] fvs = [instance.vector for instance in instances]
targets = [instance.opinion != 'neutral' for instance in instances] targets = [instance.opinion for instance in instances]
self.expr_clf.fit(fvs, targets) self.expr_clf.fit(fvs, targets)
def get_feature_vector(self, instance): def get_feature_vector(self, instance):
...@@ -27,17 +39,24 @@ class SentimentAnalyzer: ...@@ -27,17 +39,24 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances): def expresses_sentiment(self, instances):
return self.expr_clf.predict([instance.vector for instance in instances]) return self.expr_clf.predict([instance.vector for instance in instances])
semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml'
semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml' semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' # semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' #
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml' tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
tweet_test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml' tweet_test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml'
train_path = tweet_train_path train_path = semeval_2014_train_path
test_path = tweet_test_path test_path = semeval_2014_test_path
labels = ['positive', 'neutral', 'negative', 'conflict']
sa = SentimentAnalyzer() sa = SentimentAnalyzer()
train_tree = ET.parse(train_path) train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()] train_instances = [Instance(instance) for instance in train_tree.getroot()]
train_instances = resample_data(train_instances, labels)
# create and train vectorizer model # create and train vectorizer model
vec = Vectorizer(train_instances) vec = Vectorizer(train_instances)
...@@ -54,10 +73,10 @@ vec.vectorize(test_instances) ...@@ -54,10 +73,10 @@ vec.vectorize(test_instances)
# predict test set values # predict test set values
pred = sa.expresses_sentiment(test_instances) pred = sa.expresses_sentiment(test_instances)
targets = [instance.opinion != 'neutral' for instance in test_instances] targets = [instance.opinion for instance in test_instances]
# evaluate results # evaluate results
cm = confusion_matrix(targets, pred, labels=[True, False]) cm = confusion_matrix(targets, pred, labels=labels)
acc = len([i for i in range(len(targets)) if targets[i] == pred[i]]) / len(targets) acc = len([i for i in range(len(targets)) if targets[i] == pred[i]]) / len(targets)
print(cm) print(cm)
print('accuracy:', acc) print('accuracy:', acc)
...@@ -3,6 +3,7 @@ import numpy as np ...@@ -3,6 +3,7 @@ import numpy as np
import pandas as pd import pandas as pd
from feature_counter import FeatureCounter from feature_counter import FeatureCounter
from nltk.tree import ParentedTree as Tree from nltk.tree import ParentedTree as Tree
from nltk.stem import PorterStemmer
class Vectorizer: class Vectorizer:
...@@ -10,6 +11,7 @@ class Vectorizer: ...@@ -10,6 +11,7 @@ class Vectorizer:
negations = ['not', 'no', 'never', 'n\'t', 'neither', 'seldom', 'hardly'] negations = ['not', 'no', 'never', 'n\'t', 'neither', 'seldom', 'hardly']
copulas = ['is', '\'s', 'was', 'were'] copulas = ['is', '\'s', 'was', 'were']
adjetives_and_nouns = ['JJ', 'JJR', 'JJS', 'ADJP', 'NN', 'NNS', 'NP'] adjetives_and_nouns = ['JJ', 'JJR', 'JJS', 'ADJP', 'NN', 'NNS', 'NP']
ps = PorterStemmer()
def __init__(self, train_instances): def __init__(self, train_instances):
self.transformer = TfidfTransformer() self.transformer = TfidfTransformer()
...@@ -26,7 +28,7 @@ class Vectorizer: ...@@ -26,7 +28,7 @@ class Vectorizer:
train_dep_vectors = self.get_dep_vectors(train_instances, learning=True) train_dep_vectors = self.get_dep_vectors(train_instances, learning=True)
# store vectors for training set: # store vectors for training set:
train_vectors = train_indep_vectors # np.concatenate((train_indep_vectors, train_dep_vectors), axis=1) train_vectors = np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = self.transformer.fit_transform(train_vectors).toarray() train_vectors = self.transformer.fit_transform(train_vectors).toarray()
for i in range(len(train_instances)): for i in range(len(train_instances)):
...@@ -45,7 +47,7 @@ class Vectorizer: ...@@ -45,7 +47,7 @@ class Vectorizer:
# dep features: # dep features:
dep_vectors = self.get_dep_vectors(instances, learning=False) dep_vectors = self.get_dep_vectors(instances, learning=False)
# store vectors: # store vectors:
vectors = indep_vectors # np.concatenate((indep_vectors, dep_vectors), axis=1) vectors = np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = self.transformer.fit_transform(vectors).toarray() vectors = self.transformer.fit_transform(vectors).toarray()
for i in range(len(instances)): for i in range(len(instances)):
instances[i].vector = vectors[i] instances[i].vector = vectors[i]
...@@ -185,7 +187,7 @@ class Vectorizer: ...@@ -185,7 +187,7 @@ class Vectorizer:
# modified with inclusion of particles before and after ARG # modified with inclusion of particles before and after ARG
arg_pos = i + 1 arg_pos = i + 1
f = self.left_sibling_negation(subtree) f = self.left_sibling_negation(subtree)
f = f + subtree[i].leaves()[0].lower() f = f + self.ps.stem(subtree[i].leaves()[0].lower())
if self.is_particle(subtree[i+1]): if self.is_particle(subtree[i+1]):
f = f + '-' + subtree[i+1].leaves()[0].lower() f = f + '-' + subtree[i+1].leaves()[0].lower()
arg_pos = i + 2 arg_pos = i + 2
...@@ -200,23 +202,23 @@ class Vectorizer: ...@@ -200,23 +202,23 @@ class Vectorizer:
if self.is_verb(subtree[i+1][j]): if self.is_verb(subtree[i+1][j]):
if self.has_noun(subtree[i+1]): if self.has_noun(subtree[i+1]):
# rule 2: transitive verb with ARG as its subject # rule 2: transitive verb with ARG as its subject
f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_arg1' f = self.sibling_negation(subtree[i+1][j]) + self.ps.stem(subtree[i+1][j].leaves()[0].lower()) + '_arg1'
features.append(f) features.append(f)
else: else:
# rule 3: intransitive verb with ARG as its subject # rule 3: intransitive verb with ARG as its subject
f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_it_arg1' f = self.sibling_negation(subtree[i+1][j]) + self.ps.stem(subtree[i+1][j].leaves()[0].lower()) + '_it_arg1'
features.append(f) features.append(f)
if j + 1 < len(subtree[i+1]) and self.is_adverb(subtree[i+1][j+1]): if j + 1 < len(subtree[i+1]) and self.is_adverb(subtree[i+1][j+1]):
# rule 7: adverb modifies verb with ARG as subject # rule 7: adverb modifies verb with ARG as subject
main = self.main_phrase(subtree[i+1][j+1]) main = self.main_phrase(subtree[i+1][j+1])
f = 'arg1_v_' + self.sibling_negation(subtree[i+1][j+1]) + '-'.join(main.leaves()).lower() f = 'arg1_v_' + self.sibling_negation(subtree[i+1][j+1]) + self.ps.stem('-'.join(main.leaves()).lower())
features.append(f) features.append(f)
if self.is_arg_phrase(subtree): if self.is_arg_phrase(subtree):
for i in range(len(subtree)-1): for i in range(len(subtree)-1):
if self.is_adjective(subtree[i]): if self.is_adjective(subtree[i]):
# rule 4.1: adjective with ARG as its head # rule 4.1: adjective with ARG as its head
f = '-'.join(subtree[i].leaves()).lower() + '_arg1' f = self.ps.stem('-'.join(subtree[i].leaves()).lower()) + '_arg1'
if self.preceded_by_adverb(subtree[i]): if self.preceded_by_adverb(subtree[i]):
# addition to rules: include preceding adverb # addition to rules: include preceding adverb
f = subtree[i-1].leaves()[0] + '-' + f f = subtree[i-1].leaves()[0] + '-' + f
...@@ -224,7 +226,7 @@ class Vectorizer: ...@@ -224,7 +226,7 @@ class Vectorizer:
features.append(f) features.append(f)
if self.is_noun(subtree[i]) and subtree[i] == 'ARG': if self.is_noun(subtree[i]) and subtree[i] == 'ARG':
# rule 4.2: noun with ARG as its head # rule 4.2: noun with ARG as its head
f = self.left_sibling_negation(subtree[i]) + '-'.join(subtree[i].leaves()).lower() + '_arg1' f = self.left_sibling_negation(subtree[i]) + self.ps.stem('-'.join(subtree[i].leaves()).lower()) + '_arg1'
features.append(f) features.append(f)
if self.is_copula_phrase(subtree.right_sibling()): if self.is_copula_phrase(subtree.right_sibling()):
...@@ -233,7 +235,7 @@ class Vectorizer: ...@@ -233,7 +235,7 @@ class Vectorizer:
for n in self.get_descriptive_nodes(p): for n in self.get_descriptive_nodes(p):
# rule 5: adjective or noun connected by a copula with ARG # rule 5: adjective or noun connected by a copula with ARG
# extended to include adverb phrases, as these tend to include an adjective # extended to include adverb phrases, as these tend to include an adjective
f = self.sibling_negation(subtree.right_sibling()[0]) + '-'.join(n.leaves()) + '_cp_arg1' f = self.sibling_negation(subtree.right_sibling()[0]) + self.ps.stem('-'.join(n.leaves())) + '_cp_arg1'
features.append(f) features.append(f)
return features return features
...@@ -15,7 +15,7 @@ selected_reviews_location = 'reviews_to_be_annotated.xml' ...@@ -15,7 +15,7 @@ selected_reviews_location = 'reviews_to_be_annotated.xml'
min_characters = 0 min_characters = 0
max_characters = 200 max_characters = 200
n = 500 n = 500
sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative'} sentiment_mappings = {'+': 'positive', '0': 'neutral', '-': 'negative', 'c': 'conflict'}
annotated_reviews_location = 'annotated_camera_reviews.xml' annotated_reviews_location = 'annotated_camera_reviews.xml'
included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$'] included_labels = ['NN', 'NNS', 'NP', 'NNP', 'NNPS', 'DT', 'CD', 'FW', 'PRP$']
nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS'] nouns = ['NN', 'NNS', 'NP', 'NNP', 'NNPS']
...@@ -151,8 +151,8 @@ def annotate_reviews(): ...@@ -151,8 +151,8 @@ def annotate_reviews():
print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC) print(bcolors.OKBLUE + '{} reviews annotated'.format(n_annotated) + bcolors.ENDC)
print('') print('')
print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\']' + bcolors.ENDC) print(bcolors.OKBLUE + 'annotation: [\'i\'|\'n,m\'] [\'+\'|\'0\'|\'-\'|\'c\']' + bcolors.ENDC)
print(bcolors.OKBLUE + 'continue: \'c\'' + bcolors.ENDC) print(bcolors.OKBLUE + 'next: \'n\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC) print(bcolors.OKBLUE + 'skip: \'s\'' + bcolors.ENDC)
print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC) print(bcolors.OKBLUE + 'quit: \'q\'' + bcolors.ENDC)
print('') print('')
...@@ -202,14 +202,14 @@ def annotate_reviews(): ...@@ -202,14 +202,14 @@ def annotate_reviews():
rng = (int(fst.split(',')[0]), int(fst.split(',')[1])) rng = (int(fst.split(',')[0]), int(fst.split(',')[1]))
snd = task.split(' ')[1] snd = task.split(' ')[1]
if snd in ['+', '0', '-']: if snd in sentiment_mappings.keys():
sentiment = snd sentiment = snd
if rng and sentiment: if rng and sentiment:
annotations.append((rng, sentiment)) annotations.append((rng, sentiment))
if task in ['c', 's', 'q']: if task in ['n', 's', 'q']:
if task in ['c']: if task in ['n']:
# save annotations to tree # save annotations to tree
annotations_node = SubElement(sentence, 'annotations') annotations_node = SubElement(sentence, 'annotations')
for annotation in annotations: for annotation in annotations:
...@@ -291,14 +291,30 @@ def prepare_annotated_reviews(): ...@@ -291,14 +291,30 @@ def prepare_annotated_reviews():
start, end = annotation.find('range').text.split(',') start, end = annotation.find('range').text.split(',')
tree_node.text = labelled_tree_str(tree_str, int(start), int(end)) tree_node.text = labelled_tree_str(tree_str, int(start), int(end))
counts = {'positive': 0, 'neutral': 0, 'negative': 0} train_count = 1000
train_root = Element('data')
test_root = Element('data')
counts = {'positive': 0, 'neutral': 0, 'negative': 0, 'conflict': 0}
for instance in prepared_root: for instance in prepared_root:
if counts[instance.find('opinion').text] < train_count:
train_root.append(instance)
else:
test_root.append(instance)
counts[instance.find('opinion').text] += 1 counts[instance.find('opinion').text] += 1
print(counts) print(counts)
print(len(train_root))
print(len(test_root))
xmlstr = minidom.parseString(tostring(train_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open('amazon_camera_train.xml', 'w') as f:
f.write(xmlstr)
xmlstr = minidom.parseString(tostring(prepared_root)).toprettyxml(indent=' ') xmlstr = minidom.parseString(tostring(test_root)).toprettyxml(indent=' ')
xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()]) xmlstr = os.linesep.join([s for s in xmlstr.splitlines() if s.strip()])
with open(prepared_reviews_location, 'w') as f: with open('amazon_camera_test.xml', 'w') as f:
f.write(xmlstr) f.write(xmlstr)
# prepare_reviews() # prepare_reviews()
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment