Commit 2c282c01 authored by Joel Oksanen
Browse files

Implemented BoW feature vector achieving 0.664 on tweets.

parent 188b0d79
from stanfordcorenlp import StanfordCoreNLP
from xml.etree.ElementTree import ElementTree, parse, Element, SubElement
from nltk.tree import ParentedTree as Tree
from stanfordcorenlp import StanfordCoreNLP
import re
# Converts the raw training file into an XML file of labelled instances.
#
# The raw file is read as consecutive three-line records:
#   line 1: the text, with the target marked by the placeholder $T$
#   line 2: the target (argument) string
#   line 3: an integer polarity; -1 / 0 / 1 map to negative / neutral / positive
# For every kept record an <instance> element is written holding the text
# (target replaced by ARG), the opinion label and the CoreNLP parse tree in
# which the target's subtree has been collapsed to the single leaf ARG.
filepath = 'train.raw'
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')

train_root = Element('data')
opinion_labels = ['negative', 'neutral', 'positive']
prepared_counts = {
    'positive': 0,
    'neutral': 0,
    'negative': 0
}

with open(filepath, 'r') as file:
    data = file.read().split('\n')

try:
    # Stop two lines before the end so that a trailing newline (which leaves an
    # empty final element after split) or an incomplete last record cannot
    # cause an IndexError on data[i+1] / data[i+2].
    for i in range(0, len(data) - 2, 3):
        text = data[i]
        argument = data[i+1]
        # shift the polarity by one to index into opinion_labels
        opinion = opinion_labels[int(data[i+2]) + 1]

        # skip texts containing a free-standing full stop
        # (presumably multi-sentence texts -- TODO confirm)
        if ' . ' in text:
            continue

        # replace all occurrences of two or more . with standardised ...
        text = re.sub(r'[.][.]+', '...', text)

        # get corenlp tree with argument in place
        parse_tree_str = nlp.parse(text.replace('$T$', argument))

        # replace argument with ARG in tree
        parse_tree = Tree.fromstring(parse_tree_str)
        for subtree in parse_tree.subtrees():
            if ' '.join(subtree.leaves()) == argument:
                # iterate over a copy because the children are removed in place
                for child in list(subtree):
                    subtree.remove(child)
                subtree.insert(0, 'ARG')

        labelled_parse_tree_str = str(parse_tree)

        instance_node = SubElement(train_root, 'instance')

        text_node = SubElement(instance_node, 'text')
        text_node.text = text.replace('$T$', 'ARG')

        opinion_node = SubElement(instance_node, 'opinion')
        opinion_node.text = opinion

        opinion_tree_node = SubElement(instance_node, 'tree')
        opinion_tree_node.text = labelled_parse_tree_str

        prepared_counts[opinion] += 1
finally:
    # shut the CoreNLP backend down even if parsing fails part-way through
    nlp.close()

train_tree = ElementTree(train_root)
train_tree.write('tweet_train_with_labelled_parse_trees.xml')

# number of instances written per opinion label
print(prepared_counts)
from nltk.tree import ParentedTree as Tree
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
class FeatureVector:
    """Feature vector for one parsed sentence whose target is labelled ARG.

    The vector is the mean Word2Vec embedding of the sentence tokens followed
    by two sentiment-lexicon scores (positive, negative).
    """

    # Lexicon indexed by token with columns 'mean' and 'std'; read once when
    # the class is created.
    # NOTE(review): `error_bad_lines` was removed in pandas 2.0 (replacement:
    # on_bad_lines='skip') -- confirm the pinned pandas version before upgrading.
    sentiment_lexicon = pd.read_csv('data/SocialSent/subreddit_sentiment_lexicons/technology.tsv', index_col=0, header=None, names=['mean', 'std'], sep='\t', error_bad_lines=False)

    # in: sentence parse tree with labelled argument ARG
    def __init__(self, tree, token_model):
        """Build ``self.vector`` from ``tree``.

        tree: parse tree exposing ``leaves()``; the leaf 'ARG' is ignored.
        token_model: trained model whose ``wv`` attribute maps tokens to
            embedding vectors.
        """
        self.vector = []
        tokens = [token for token in tree.leaves() if token != 'ARG']

        # target-independent features:
        # words, punctuation using mean of Word2Vec vectors for tokens
        word_vectors = token_model.wv
        # tokens missing from the vocabulary are skipped instead of raising KeyError
        known = [word_vectors[token] for token in tokens if token in word_vectors]
        if known:
            self.vector.extend(np.mean(np.array(known), axis=0))
        else:
            # no usable token: fall back to a zero embedding so that every
            # feature vector keeps the same length
            self.vector.extend(np.zeros(word_vectors.vector_size))

        # sentiment lexicon features
        self.vector.extend(self.sentiment_scores(tokens))

        # TODO: target-dependent features

    def sentiment_scores(self, l):
        """Return ``(pos, neg)`` lexicon scores averaged over the tokens in ``l``.

        Each token found in the lexicon adds the absolute value of its mean
        score to ``neg`` when the mean is negative and to ``pos`` otherwise.
        Both sums are divided by the number of tokens; an empty ``l`` yields
        ``(0.0, 0.0)`` instead of dividing by zero.
        """
        if not l:
            return (0.0, 0.0)

        pos = 0
        neg = 0
        for token in l:
            if token in self.sentiment_lexicon.index:
                mean = self.sentiment_lexicon.loc[token, 'mean']
                if mean < 0:
                    neg += abs(mean)
                else:
                    pos += abs(mean)
        return (pos / len(l), neg / len(l))
from nltk.tree import ParentedTree as Tree
class Instance:
    """A single labelled example read from an XML element.

    Attributes:
        text: text of the element's <text> child.
        opinion: text of the element's <opinion> child.
        tree: parse tree built from the text of the <tree> child.
    """

    def __init__(self, xml):
        """Populate the instance from ``xml``, an element with text/opinion/tree children."""
        text_node = xml.find('text')
        opinion_node = xml.find('opinion')
        tree_node = xml.find('tree')

        self.text = text_node.text
        self.opinion = opinion_node.text
        # the tree is stored in its bracketed string form and parsed here
        self.tree = Tree.fromstring(tree_node.text)
import xml.etree.ElementTree as ET
from nltk.tree import ParentedTree as Tree
from sklearn import svm
from feature_vector import FeatureVector
from gensim.test.utils import common_texts, get_tmpfile
from vectorizer import Vectorizer
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import confusion_matrix
from instance import Instance
from sklearn.feature_extraction.text import CountVectorizer
import os
class SentimentAnalyzer:
    """Trains and applies an SVM on pre-computed feature vectors.

    Every method previously existed in two conflicting definitions (the later
    one silently replaced the earlier one). Each pair is merged here into a
    single method that accepts both call shapes.
    """

    # Both attributes live on the class, so they are shared by all instances
    # until an instance assigns its own (train_w2v_model does so for token_model).
    expr_clf = svm.SVC()  # determines if sentence expresses sentiment towards ARG
    # previously saved Word2Vec model, or None when no model file exists yet
    token_model = Word2Vec.load('word2vec.model') if os.path.isfile('./word2vec.model') else None

    def train_w2v_model(self, texts):
        """Train a Word2Vec model on ``texts`` (token lists) and save it to 'word2vec.model'."""
        # NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size`
        # in gensim 4) -- confirm the pinned gensim version.
        self.token_model = Word2Vec(texts, size=100, window=5, min_count=1, workers=4)
        self.token_model.save('word2vec.model')

    def train_expr_clf(self, feature_vectors, targets=None):
        """Fit the classifier.

        feature_vectors: objects exposing a ``vector`` attribute (feature
            vectors or instances).
        targets: one label per object. When omitted, each object must also
            expose ``opinion`` and the label is ``opinion != 'neutral'``.
        """
        items = list(feature_vectors)
        if targets is None:
            targets = [item.opinion != 'neutral' for item in items]
        self.expr_clf.fit([item.vector for item in items], targets)

    def get_feature_vector(self, instance):
        """Return a FeatureVector for ``instance``, a sentence parse tree with labelled argument ARG."""
        # use the analyzer's token model; passing None made FeatureVector fail
        return FeatureVector(instance, self.token_model)

    # in: objects carrying the feature vector of a sentence with labelled argument ARG
    # out: per object, the classifier's prediction
    def expresses_sentiment(self, instances):
        """Predict a label for each object in ``instances`` from its ``vector`` attribute."""
        return self.expr_clf.predict([instance.vector for instance in instances])
# Trains the sentiment-expression classifier on the training set and reports
# its confusion matrix and accuracy on the test set.
#
# The previous text interleaved two pipelines (Word2Vec feature vectors and
# bag-of-words vectors), called train_expr_clf with two different arities and
# overwrote test_instances with the training data. Only the bag-of-words
# pipeline is kept here.
train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml' # 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
test_path = 'data/acl-14-short-data/tweet_test_with_labelled_parse_trees.xml' # 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml'

sa = SentimentAnalyzer()

train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()]

# create and train vectorizer model (also stores a vector on every training instance)
vec = Vectorizer(train_instances)

# train classifier for sentiment expression; the target is True when the
# opinion is anything other than neutral
train_targets = [instance.opinion != 'neutral' for instance in train_instances]
sa.train_expr_clf(train_instances, train_targets)

test_tree = ET.parse(test_path)
test_instances = [Instance(instance) for instance in test_tree.getroot()]

# obtain feature vectors and targets for test set
vec.vectorize(test_instances)
targets = [instance.opinion != 'neutral' for instance in test_instances]

# predict test set values
pred = sa.expresses_sentiment(test_instances)

# evaluate results
cm = confusion_matrix(targets, pred, labels=[True, False])
acc = sum(1 for target, predicted in zip(targets, pred) if target == predicted) / len(targets)

print(cm)
print('accuracy:', acc)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
class Vectorizer:
    """Turns instances into TF-IDF weighted bag-of-words + sentiment vectors.

    The constructor fits the bag-of-words vocabulary and the TF-IDF weights
    on the training instances; ``vectorize`` applies the fitted models to
    further instances. Both store the result on each instance as ``vector``.
    """

    # Lexicon indexed by token with columns 'mean' and 'std'; read once when
    # the class is created.
    # NOTE(review): `error_bad_lines` was removed in pandas 2.0 (replacement:
    # on_bad_lines='skip') -- confirm the pinned pandas version before upgrading.
    sentiment_lexicon = pd.read_csv('data/SocialSent/2000.tsv', index_col=0, header=None, names=['mean', 'std'], sep='\t', error_bad_lines=False)

    def __init__(self, train_instances):
        """Fit on ``train_instances`` (objects with a ``text`` attribute) and vectorise them."""
        self.transformer = TfidfTransformer()

        # indep features:
        self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
        texts = [instance.text for instance in train_instances]
        train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray()
        train_vectors = self._indep_vectors(train_bow_vectors, train_instances)

        # TODO: dep (target-dependent) features are not implemented yet

        # store vectors for training set:
        train_vectors = self.transformer.fit_transform(train_vectors).toarray()
        for instance, vector in zip(train_instances, train_vectors):
            instance.vector = vector

        # print length of feature vector
        print(len(train_vectors[0]))

    # used in vectorizing test set
    def vectorize(self, instances):
        """Vectorise ``instances`` with the models fitted in the constructor."""
        # indep features:
        texts = [instance.text for instance in instances]
        bow_vectors = self.bow_vectorizer.transform(texts).toarray()
        vectors = self._indep_vectors(bow_vectors, instances)

        # TODO: dep (target-dependent) features are not implemented yet

        # store vectors:
        # transform only -- refitting here would recompute the IDF weights from
        # the data being vectorised instead of reusing the training fit
        vectors = self.transformer.transform(vectors).toarray()
        for instance, vector in zip(instances, vectors):
            instance.vector = vector

    def _indep_vectors(self, bow_vectors, instances):
        """Append each instance's two sentiment scores to its bag-of-words row."""
        sent_vectors = [self.sentiment_scores(instance) for instance in instances]
        return np.concatenate((bow_vectors, sent_vectors), axis=1)

    def sentiment_scores(self, instance):
        """Return ``[pos, neg]`` lexicon scores for ``instance.text``.

        The text is split on single spaces. Each token found in the lexicon
        adds the absolute value of its mean score to ``neg`` when the mean is
        negative and to ``pos`` otherwise; both sums are divided by the
        number of tokens.
        """
        tokens = instance.text.split(' ')
        pos = 0
        neg = 0
        for token in tokens:
            if token in self.sentiment_lexicon.index:
                mean = self.sentiment_lexicon.loc[token, 'mean']
                if mean < 0:
                    neg += abs(mean)
                else:
                    pos += abs(mean)
        # split(' ') never returns an empty list, so the division is safe
        return [pos / len(tokens), neg / len(tokens)]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment