Commit 188b0d79 authored by Joel Oksanen

Implemented a simple Word2Vec sentiment classifier using SemEval data; not working well

parent b2670b8a
@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
from stanfordcorenlp import StanfordCoreNLP
import re
-tree = ET.parse('ABSA16_Laptops_Train_SB1_v2.xml')
+tree = ET.parse('EN_LAPT_SB1_TEST_.xml.gold')
reviews = tree.getroot()
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
@@ -22,4 +22,4 @@ for review in reviews:
        parse_tree.text = parse_tree_str
        sentence.append(parse_tree)
-tree.write('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
+tree.write('ABSA16_Laptops_Test_with_parse_trees.xml')
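For context, the part of this script elided by the hunks above presumably obtains a constituency parse for each sentence from the stanfordcorenlp wrapper; a minimal sketch of that loop, assuming the standard SemEval XML layout (sentence elements with a text child):

for review in reviews:
    for sentence in review.iter('sentence'):
        text = sentence.find('text').text
        parse_tree_str = nlp.parse(text)   # constituency parse as a bracketed string
        parse_tree = ET.Element('tree')    # tag matches sentence.find('tree') used downstream
        parse_tree.text = parse_tree_str
        sentence.append(parse_tree)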
-import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import ElementTree, parse, Element, SubElement
from nltk.tokenize import word_tokenize
import string
from nltk.tree import ParentedTree as Tree
-tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
-reviews = tree.getroot()
glossary = {
    'laptop': ['computer', 'device', 'machine', 'price', 'cost', 'macbook', 'mac', 'pc', 'speed', 'it', 'this', 'product'],
    'display': ['monitor', 'screen', 'touchscreen'],
@@ -113,10 +109,21 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
    assert parse_tree != modified_tree
    return modified_tree
+tree = parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
+reviews = tree.getroot()
+n = len(reviews)
i = 0
-prepped_opinions = 0
-total_opinions = 0
+prepared_opinions_count = 0
+total_opinions_count = 0
+prepared_counts = {
+    'positive': 0,
+    'neutral': 0,
+    'negative': 0
+}
+train_root = Element('data')
for review in reviews:
    sentences = review[0]
@@ -130,23 +137,52 @@ for review in reviews:
        parse_tree_str = sentence.find('tree').text
        parse_tree = Tree.fromstring(parse_tree_str)
        nps = extract_extended_nouns(parse_tree)
+        opinion_trees = []
        # attempt to identify opinion target in sentence
        for opinion in opinions:
-            total_opinions += 1
+            total_opinions_count += 1
            modified_tree = replace_feature_nps_tree(opinion[0], parse_tree, nps)
-            if modified_tree:
-                prepped_opinions += 1
-                print('---')
+            if modified_tree and prepared_counts[opinion[1]] < 500:
+                opinion_trees.append(modified_tree)
+                prepared_opinions_count += 1
+                prepared_counts[opinion[1]] += 1
                print('review text:')
                print(text)
                print('')
                print('tree:')
                modified_tree.pretty_print()
                print('modified text:')
                print(' '.join(modified_tree.leaves()))
                print('')
                print('labelled opinion:')
                print(opinion)
                print('')
                print('---')
+                # store in new train_tree
+                instance_node = SubElement(train_root, 'instance')
+                text_node = SubElement(instance_node, 'text')
+                text_node.text = text
+                opinion_node = SubElement(instance_node, 'opinion')
+                opinion_node.text = opinion[1]
+                opinion_tree_node = SubElement(instance_node, 'tree')
+                opinion_tree_node.text = str(modified_tree)
            else:
                pass
                # print('---')
                # print(text)
                # print(nps)
                # print(opinion)
    i += 1
    print('{}/{}'.format(i, n))
-print('{}/{} opinions prepared'.format(prepped_opinions, total_opinions))
+train_tree = ElementTree(train_root)
+train_tree.write('ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml')
+print('{}/{} opinions prepared'.format(prepared_opinions_count, total_opinions_count))
+print(prepared_counts)
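For reference, each instance written above serializes as <instance><text>...</text><opinion>...</opinion><tree>...</tree></instance>; a minimal sketch of reading the prepared file back, mirroring the find() calls the classifier below relies on:

from xml.etree.ElementTree import parse
from nltk.tree import ParentedTree as Tree

for instance in parse('ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml').getroot():
    label = instance.find('opinion').text                         # 'positive' / 'neutral' / 'negative'
    tokens = Tree.fromstring(instance.find('tree').text).leaves()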
#########################
SocialSent Sentiment Data
#########################
This directory contains sentiment lexicons for the top 250 subreddits on Reddit (by comment count, excluding non-English communities).
See http://nlp.stanford.edu/projects/socialsent for links to the accompanying paper, with details on the algorithm, seed words, and data sources.
All files are .tsv files of the form:
<word> <mean_sentiment> <std_sentiment>
where mean_sentiment is the average inferred sentiment across bootstrap-sampled SentProp runs
and std_sentiment is the standard deviation of these samples.
SentProp was run with the following hyperparameters:
num nearest neighbors k=25
random walk beta=0.9
50 bootstrap samples of size 7
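A lexicon in this format loads directly with pandas, as the FeatureVector class below does; a minimal sketch ('great' is just an example word that may or may not appear in a given lexicon):

import pandas as pd

# word -> (mean_sentiment, std_sentiment); tab-separated, no header row
lexicon = pd.read_csv('data/SocialSent/subreddit_sentiment_lexicons/technology.tsv',
                      index_col=0, header=None, names=['mean', 'std'], sep='\t')
if 'great' in lexicon.index:
    print(lexicon.loc['great', 'mean'], lexicon.loc['great', 'std'])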
from nltk.tree import ParentedTree as Tree
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
class FeatureVector:
    sentiment_lexicon = pd.read_csv('data/SocialSent/subreddit_sentiment_lexicons/technology.tsv', index_col=0, header=None, names=['mean', 'std'], sep='\t', error_bad_lines=False)

    # in: sentence parse tree with labelled argument ARG
    def __init__(self, tree, token_model):
        self.vector = []
        tokens = [token for token in tree.leaves() if token != 'ARG']
        # target-independent features:
        # words, punctuation using mean of Word2Vec vectors for tokens
        self.vector.extend(np.mean(np.array([token_model.wv[token] for token in tokens]), axis=0))
        # sentiment lexicon features
        self.vector.extend(self.sentiment_scores(tokens))
        # TODO: target-dependent features

    def sentiment_scores(self, l):
        pos = 0
        neg = 0
        for token in l:
            if token in self.sentiment_lexicon.index:
                mean, std = self.sentiment_lexicon.loc[token]
                if mean < 0:
                    neg += abs(mean)
                else:
                    pos += abs(mean)
        return (pos / len(l), neg / len(l))
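A quick usage sketch of FeatureVector; the two-sentence corpus and the parse tree are made up for illustration, the import assumes this file is feature_vector.py as in the training script below, and the SocialSent lexicon path must resolve:

from nltk.tree import ParentedTree as Tree
from gensim.models import Word2Vec
from feature_vector import FeatureVector

# Hypothetical miniature corpus; gensim 3.x takes size= for the vector dimensionality
corpus = [['the', 'ARG', 'is', 'great'], ['the', 'ARG', 'is', 'slow']]
model = Word2Vec(corpus, size=100, window=5, min_count=1, workers=4)

# Parse tree with the opinion target already replaced by ARG
tree = Tree.fromstring('(S (NP (DT the) (NN ARG)) (VP (VBZ is) (JJ great)))')
fv = FeatureVector(tree, model)
print(len(fv.vector))  # 100 Word2Vec dimensions + 2 sentiment lexicon scores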
import xml.etree.ElementTree as ET
from nltk.tree import ParentedTree as Tree
from sklearn import svm
from feature_vector import FeatureVector
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.metrics import confusion_matrix
import os
class SentimentAnalyzer:
    expr_clf = svm.SVC()  # determines if sentence expresses sentiment towards ARG
    token_model = Word2Vec.load('word2vec.model') if os.path.isfile('./word2vec.model') else None

    def train_w2v_model(self, texts):
        # path = get_tmpfile('word2vec.model')
        self.token_model = Word2Vec(texts, size=100, window=5, min_count=1, workers=4)
        self.token_model.save('word2vec.model')

    def train_expr_clf(self, feature_vectors, targets):
        print([fv.vector for fv in feature_vectors])
        print(targets)
        self.expr_clf.fit([fv.vector for fv in feature_vectors], targets)

    def get_feature_vector(self, tree):
        return FeatureVector(tree, self.token_model)

    # in: sentence parse trees with labelled argument ARG
    # out: predicted sentiment label ('positive'/'neutral'/'negative') for each feature vector
    def expresses_sentiment(self, feature_vectors):
        return self.expr_clf.predict([fv.vector for fv in feature_vectors])
sa = SentimentAnalyzer()
train_tree = ET.parse('data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml')
instances = train_tree.getroot()
sa.train_w2v_model([Tree.fromstring(instance.find('tree').text).leaves() for instance in instances])
feature_vectors = [sa.get_feature_vector(Tree.fromstring(instance.find('tree').text)) for instance in instances]
targets = [instance.find('opinion').text for instance in instances]
sa.train_expr_clf(feature_vectors, targets)
test_tree = ET.parse('data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml')
test_instances = test_tree.getroot()
feature_vectors = [sa.get_feature_vector(Tree.fromstring(instance.find('tree').text)) for instance in test_instances]
targets = [instance.find('opinion').text for instance in test_instances]
pred = sa.expresses_sentiment(feature_vectors)
#print([(pred[i], targets[i]) for i in range(len(pred))])
cm = confusion_matrix(targets, pred, labels=['positive', 'neutral', 'negative'])
print(cm)
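The raw confusion matrix is hard to read on its own; an optional follow-up with sklearn's classification_report gives per-class precision and recall in the same label order:

from sklearn.metrics import classification_report

# Rows of cm are true labels, columns are predictions, ordered ['positive', 'neutral', 'negative']
print(classification_report(targets, pred, labels=['positive', 'neutral', 'negative']))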
import requests
import threading
from anytree import Node
# rel = '/r/DefinedAs'
# rel2 = '/r/MadeOf'
# uri = '/query?start=/c/en/{feature}&rel={rel}&rel={rel2}'.format(feature=feature, rel=rel, rel2=rel2) # '/related/c/en/cpu?filter=/c/en'
#
# uri = '/c/en/{feature}'.format(feature=feature)
#
# print([(obj['edges'][i]['rel']['label'],obj['edges'][i]['end']['label']) for i in range(len(obj['edges']))])
import sys
class ConceptNet:
    url = 'http://api.conceptnet.io'
    limit = 5
@@ -47,7 +39,7 @@ class ConceptNet:
        synonyms.difference_update(rm)
        self.parent_check(node, parent.parent, synonyms)

-    def synonyms_for_node(self, node):
+    def sem_synonyms_for_node(self, node):
        rels = ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']
        synonyms = set()
@@ -66,10 +58,7 @@ class ConceptNet:
        return synonyms
net = ConceptNet()
-computer = Node('display')
-cpu = Node('quality', parent=computer)
-syns = net.synonyms_for_node(cpu)
+parent = Node(str(sys.argv[1]))
+child = Node(str(sys.argv[2]), parent=parent)
+syns = net.sem_synonyms_for_node(child)
print(syns)
-# print(requests.get('http://54.243.2.221' + '/c/en/computer').json())
-# print(requests.get('http://54.243.2.221' + '/c/en/cpu').json())
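For reference, a minimal sketch of the raw query this class wraps, using the endpoint and edge fields visible in the commented-out experiments above ('laptop' is an arbitrary example term):

import requests

# Fetch edges for a concept from the public ConceptNet API and keep those
# whose relation is one of the labels used by sem_synonyms_for_node
obj = requests.get('http://api.conceptnet.io/c/en/laptop?limit=5').json()
for edge in obj['edges']:
    if edge['rel']['label'] in ['DefinedAs', 'Synonym', 'IsA', 'RelatedTo']:
        print(edge['rel']['label'], '->', edge['end']['label'])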