Commit 1536e4ad authored by Joel Oksanen
Browse files

Target-dependent features implemented, improvement of 0.0071 in accuracy...

Target-dependent features implemented, yielding an improvement of 0.0071 in accuracy compared to using only target-independent features
parent 8b1ab0c1
......@@ -162,7 +162,7 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
# print(mod)
# Parse the ABSA16 laptop TEST set (the earlier parse of the train-set
# file was superseded and performed redundant I/O, so it is removed).
tree = parse('ABSA16_Laptops_Test_with_parse_trees.xml')
reviews = tree.getroot()
# number of reviews in the corpus
n = len(reviews)
......@@ -234,7 +234,7 @@ for review in reviews:
train_tree = ElementTree(train_root)
# Write the labelled parse trees for the TEST set only (the earlier write
# of the train-set file was superseded diff residue and is removed).
train_tree.write('ABSA16_Laptops_Test_with_labelled_parse_trees.xml')
print('{}/{} opinions prepared'.format(prepared_opinions_count, total_opinions_count))
......
class FeatureCounter:
    """Assigns a stable integer index to each distinct feature string.

    Indices are handed out in first-seen order while learning; when not
    learning, unknown features map to None instead of being registered.
    """

    def __init__(self):
        # feature string -> integer index, starts empty
        self.dep_features = {}
        # next index to hand out / number of features seen
        self.n_dep_features = 0

    def indexof(self, feature, learning):
        """Return the index of `feature`, registering it first when learning."""
        try:
            return self.dep_features[feature]
        except KeyError:
            if not learning:
                return None
            index = self.n_dep_features
            self.dep_features[feature] = index
            self.n_dep_features += 1
            return index

    def count(self):
        """Number of distinct features registered so far."""
        return self.n_dep_features
......@@ -16,7 +16,7 @@ class SentimentAnalyzer:
def train_expr_clf(self, instances):
    """Fit the opinion classifier on already-vectorized instances.

    instances: objects carrying a `.vector` feature vector and an
    `.opinion` label (e.g. 'positive' / 'neutral' / 'negative').
    """
    fvs = [instance.vector for instance in instances]
    # Train directly on the three-way opinion labels; the earlier binary
    # (opinion != 'neutral') target assignment was dead diff residue and
    # has been removed.
    targets = [instance.opinion for instance in instances]
    self.expr_clf.fit(fvs, targets)
def get_feature_vector(self, instance):
......@@ -27,9 +27,12 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances):
    """Predict an opinion label for each instance's feature vector."""
    vectors = [instance.vector for instance in instances]
    return self.expr_clf.predict(vectors)
# Dataset paths: SemEval-2016 laptop reviews and ACL-14 tweets, both
# pre-processed with labelled parse trees. (The older direct train_path /
# test_path tweet assignments were superseded diff residue and are removed.)
semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml'
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
tweet_test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml'

# Select which corpus to train and evaluate on.
train_path = semeval_train_path
test_path = semeval_test_path

sa = SentimentAnalyzer()
......@@ -51,10 +54,10 @@ vec.vectorize(test_instances)
# predict test set values
pred = sa.expresses_sentiment(test_instances)
# Gold labels are the three-way opinions (the binary != 'neutral' form
# was superseded diff residue and is removed).
targets = [instance.opinion for instance in test_instances]

# evaluate results: three-way confusion matrix plus overall accuracy
cm = confusion_matrix(targets, pred, labels=['positive', 'neutral', 'negative'])
acc = sum(1 for t, p in zip(targets, pred) if t == p) / len(targets)
print(cm)
print('accuracy:', acc)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from feature_counter import FeatureCounter
from nltk.tree import ParentedTree as Tree
class Vectorizer:
sentiment_lexicon = pd.read_csv('data/SocialSent/2000.tsv', index_col=0, header=None, names=['mean', 'std'], sep='\t', error_bad_lines=False)
negations = ['not', 'no', 'never', 'n\'t', 'neither', 'seldom', 'hardly']
copulas = ['is', '\'s', 'was', 'were']
adjetives_and_nouns = ['JJ', 'JJR', 'JJS', 'ADJP', 'NN', 'NNS', 'NP']
def __init__(self, train_instances):
self.transformer = TfidfTransformer()
# indep features:
self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
texts = [instance.text for instance in train_instances]
......@@ -17,10 +22,11 @@ class Vectorizer:
train_indep_vectors = np.concatenate((train_bow_vectors, train_sent_vectors), axis=1)
# dep features:
train_dep_vectors = np.array([[]])
self.fc = FeatureCounter()
# train_dep_vectors = self.get_dep_vectors(train_instances, learning=True)
# store vectors for training set:
train_vectors = train_indep_vectors # np.concatenate((train_indep_vectors, train_dep_vectors), axis=1)
train_vectors = train_indep_vectors
train_vectors = self.transformer.fit_transform(train_vectors).toarray()
for i in range(len(train_instances)):
......@@ -37,9 +43,9 @@ class Vectorizer:
sent_vectors = [self.sentiment_scores(instance) for instance in instances]
indep_vectors = np.concatenate((bow_vectors, sent_vectors), axis=1)
# dep features:
dep_vectors = np.array([[]])
# dep_vectors = self.get_dep_vectors(instances, learning=False)
# store vectors:
vectors = indep_vectors # np.concatenate((indep_vectors, dep_vectors), axis=1)
vectors = indep_vectors
vectors = self.transformer.fit_transform(vectors).toarray()
for i in range(len(instances)):
instances[i].vector = vectors[i]
......@@ -56,3 +62,182 @@ class Vectorizer:
else:
pos += abs(mean)
return [pos / len(tokens), neg / len(tokens)]
def get_dep_vectors(self, instances, learning):
    """Build binary dependency-feature vectors for the given instances.

    Each row has one slot per feature known to self.fc plus a trailing
    bias slot that is always 1. When `learning` is True, new features are
    registered with the counter; otherwise unknown features map to None
    and are skipped.
    """
    feature_indices = []
    for instance in instances:
        features = self.get_dep_features(instance)
        feature_indices.append({self.fc.indexof(feature, learning) for feature in features})
    # One column per known feature plus the trailing always-on bias column.
    dep_vectors = np.zeros((len(instances), self.fc.count() + 1))
    print(dep_vectors.shape)
    for i in range(len(instances)):
        for feature_index in feature_indices[i]:
            # Must test against None explicitly: the old truthiness check
            # (`if feature_index:`) silently dropped the feature at index 0.
            if feature_index is not None:
                dep_vectors[i][feature_index] = 1
        # bias term
        dep_vectors[i][len(dep_vectors[i]) - 1] = 1
    return dep_vectors
def is_verb(self, t):
    """True for a non-copular verb node (VB/VBP/VBD/VBZ)."""
    if type(t) is not Tree:
        return False
    return t.label() in ('VB', 'VBP', 'VBD', 'VBZ') and not self.is_copula(t)
def is_copula(self, t):
    """True for a VBD/VBZ node whose first leaf is a known copula."""
    if type(t) is not Tree:
        return False
    return t.label() in ('VBD', 'VBZ') and t.leaves()[0] in self.copulas
def has_arg(self, t):
    """True for an NP node whose leaves contain the target placeholder 'ARG'."""
    if type(t) is not Tree or t.label() != 'NP':
        return False
    return 'ARG' in t.leaves()
def is_arg_phrase(self, t):
    """True for an NP node with 'ARG' as a direct child (not just any leaf)."""
    if type(t) is not Tree:
        return False
    return t.label() == 'NP' and 'ARG' in t
def is_negation(self, t):
    """True for a single-leaf node whose leaf is a negation word."""
    if type(t) is not Tree or len(t.leaves()) != 1:
        return False
    return t.leaves()[0] in self.negations
def is_adjective(self, t):
    """True for an adjective or adjective-phrase node."""
    if type(t) is not Tree:
        return False
    return t.label() in ('JJ', 'JJR', 'JJS', 'ADJP')
def is_particle(self, t):
    """True for a particle (PRT) node."""
    if type(t) is not Tree:
        return False
    return t.label() == 'PRT'
def is_noun(self, t):
    """True for a noun (NN) or noun-phrase (NP) node."""
    if type(t) is not Tree:
        return False
    return t.label() in ('NN', 'NP')
def is_adverb(self, t):
    """True for an adverb(-phrase) node that contains no negation words."""
    if type(t) is not Tree or t.label() not in ('ADVP', 'RB', 'RBR', 'RBS'):
        return False
    return not any(leaf.lower() in self.negations for leaf in t.leaves())
def is_copula_phrase(self, t):
    """True for a VP node whose first child is a copula."""
    if type(t) is not Tree:
        return False
    return t.label() == 'VP' and self.is_copula(t[0])
def preceded_by_adverb(self, t):
    """Truthy when t's left sibling is an adverb node (RB/RBR/RBS).

    Returns None (falsy) when t has no left sibling, matching the
    original short-circuit behavior.
    """
    sib = t.left_sibling()
    return sib and sib.label() in ('RB', 'RBR', 'RBS')
def has_noun(self, t):
    """True if any subtree of t (including t itself) is labelled NP."""
    for node in t.subtrees():
        if node.label() == 'NP':
            return True
    return False
def left_sibling_negation(self, t):
    """Return 'neg-' repeated once per negation among t's left siblings.

    Returns the empty string when no left sibling is a negation.
    """
    prefix = ''
    sib = t.left_sibling()
    while sib:
        if self.is_negation(sib):
            prefix = 'neg-' + prefix
        sib = sib.left_sibling()
    return prefix
def sibling_negation(self, t):
    """Return 'neg-' repeated once per negation among ALL of t's siblings.

    Scans the left siblings first, then the right siblings, exactly as the
    original two while-loops did; the empty string means no negations.
    """
    prefix = ''
    for step in ('left_sibling', 'right_sibling'):
        sib = getattr(t, step)()
        while sib:
            if self.is_negation(sib):
                prefix = 'neg-' + prefix
            sib = getattr(sib, step)()
    return prefix
# returns tree where possible sub-phrases of t have been removed
def main_phrase(self, t):
    # Rebuild t recursively, keeping only children that are Tree nodes and
    # not SBAR (subordinate clause) nodes.
    children = []
    for n in t:
        if type(n) is Tree and n.label() != 'SBAR':
            children.append(self.main_phrase(n))
        else:
            # NOTE(review): this returns the first non-Tree (leaf) or SBAR
            # child immediately, abandoning the children collected so far --
            # looks suspicious (skipping the child may have been intended);
            # confirm against the callers before changing.
            return n
    return Tree(t.label(), children)
# returns left-most direct child of t labelled ADVP, ADJP or NP
def first_descriptive_phrase(self, t):
    """Return t's first direct child labelled ADVP/ADJP/NP, or None.

    Skips string leaves: they have no label(), and the original code
    raised AttributeError when it reached one before a matching phrase.
    """
    for n in t:
        if type(n) is Tree and n.label() in ('ADVP', 'ADJP', 'NP'):
            return n
    return None
def get_descriptive_nodes(self, t):
    """Collect adjective/noun nodes from t, recursing through other subtrees."""
    nodes = []
    for child in t:
        if type(child) is not Tree:
            continue
        if child.label() in self.adjetives_and_nouns:
            nodes.append(child)
        else:
            nodes.extend(self.get_descriptive_nodes(child))
    return nodes
def get_dep_features(self, instance):
    """Extract target-dependent parse-tree features for one instance.

    Walks instance.tree -- the sentence parse in which the opinion target
    has been replaced by the placeholder token 'ARG' (see has_arg /
    is_arg_phrase) -- and emits string features for verb / adjective /
    noun / copula patterns around the target, each prefixed with 'neg-'
    markers for surrounding negations.
    """
    features = []
    for subtree in instance.tree.subtrees():
        if subtree.label() == 'VP':
            # Scan adjacent children of the verb phrase for verb + ARG-object.
            for i in range(len(subtree)-1):
                if (self.is_verb(subtree[i]) and
                        (self.has_arg(subtree[i+1]) or
                         (i + 2 < len(subtree) and self.is_particle(subtree[i+1]) and self.has_arg(subtree[i+2])))):
                    # rule 1: transitive verb with ARG as its object
                    # modified with inclusion of particles before and after ARG
                    arg_pos = i + 1
                    f = self.left_sibling_negation(subtree)
                    f = f + subtree[i].leaves()[0].lower()
                    if self.is_particle(subtree[i+1]):
                        # particle between verb and ARG, e.g. "turn on ARG"
                        f = f + '-' + subtree[i+1].leaves()[0].lower()
                        arg_pos = i + 2
                    if arg_pos + 1 < len(subtree) and self.is_particle(subtree[arg_pos+1]):
                        # particle after ARG, e.g. "turn ARG on"
                        f = f + '-' + subtree[arg_pos+1].leaves()[0].lower()
                    f = f + '_arg2'
                    features.append(f)
        # ARG noun phrase immediately followed by a verb phrase (subject role).
        for i in range(len(subtree)-1):
            if self.has_arg(subtree[i]) and subtree[i+1].label() == 'VP':
                for j in range(len(subtree[i+1])):
                    if self.is_verb(subtree[i+1][j]):
                        if self.has_noun(subtree[i+1]):
                            # rule 2: transitive verb with ARG as its subject
                            f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_arg1'
                            features.append(f)
                        else:
                            # rule 3: intransitive verb with ARG as its subject
                            f = self.sibling_negation(subtree[i+1][j]) + subtree[i+1][j].leaves()[0].lower() + '_it_arg1'
                            features.append(f)
                        if j + 1 < len(subtree[i+1]) and self.is_adverb(subtree[i+1][j+1]):
                            # rule 7: adverb modifies verb with ARG as subject
                            main = self.main_phrase(subtree[i+1][j+1])
                            f = 'arg1_v_' + self.sibling_negation(subtree[i+1][j+1]) + '-'.join(main.leaves()).lower()
                            features.append(f)
        if self.is_arg_phrase(subtree):
            for i in range(len(subtree)-1):
                if self.is_adjective(subtree[i]):
                    # rule 4.1: adjective with ARG as its head
                    f = '-'.join(subtree[i].leaves()).lower() + '_arg1'
                    if self.preceded_by_adverb(subtree[i]):
                        # addition to rules: include preceding adverb
                        f = subtree[i-1].leaves()[0] + '-' + f
                    f = self.left_sibling_negation(subtree[i]) + f
                    features.append(f)
                # NOTE(review): `subtree[i] == 'ARG'` compares a Tree to a
                # str and appears to always be False, so rule 4.2 may never
                # fire; possibly `subtree[i].leaves() == ['ARG']` was
                # intended -- confirm before changing.
                if self.is_noun(subtree[i]) and subtree[i] == 'ARG':
                    # rule 4.2: noun with ARG as its head
                    f = self.left_sibling_negation(subtree[i]) + '-'.join(subtree[i].leaves()).lower() + '_arg1'
                    features.append(f)
            if self.is_copula_phrase(subtree.right_sibling()):
                p = self.first_descriptive_phrase(subtree.right_sibling())
                if p:
                    for n in self.get_descriptive_nodes(p):
                        # rule 5: adjective or noun connected by a copula with ARG
                        # extended to include adverb phrases, as these tend to include an adjective
                        f = self.sibling_negation(subtree.right_sibling()[0]) + '-'.join(n.leaves()) + '_cp_arg1'
                        features.append(f)
    if features:
        # debug output used while developing the feature extractor
        print(instance.text)
        print(instance.tree)
        print(features)
    return features
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment