Commit 188b0d79 authored by  Joel  Oksanen's avatar Joel Oksanen
Browse files

Implemented Word2Vec simple sentiment classifier using SemEval data, not working well

parent b2670b8a
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
from stanfordcorenlp import StanfordCoreNLP
import re
tree = ET.parse('ABSA16_Laptops_Train_SB1_v2.xml')
tree = ET.parse('EN_LAPT_SB1_TEST_.xml.gold')
reviews = tree.getroot()
nlp = StanfordCoreNLP(r'/Users/joeloksanen/stanford-corenlp-full-2018-10-05')
......@@ -22,4 +22,4 @@ for review in reviews:
parse_tree.text = parse_tree_str
sentence.append(parse_tree)
tree.write('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
tree.write('ABSA16_Laptops_Test_with_parse_trees.xml')
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ElementTree, parse, Element, SubElement
from nltk.tokenize import word_tokenize
import string
from nltk.tree import ParentedTree as Tree
tree = ET.parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
glossary = {
'laptop': ['computer', 'device', 'machine', 'price', 'cost', 'macbook', 'mac', 'pc', 'speed', 'it', 'this', 'product'],
'display': ['monitor', 'screen', 'touchscreen'],
......@@ -113,10 +109,21 @@ def replace_feature_nps_tree(feature, parse_tree, np_trees):
assert parse_tree != modified_tree
return modified_tree
tree = parse('ABSA16_Laptops_Train_SB1_v2_with_parse_trees.xml')
reviews = tree.getroot()
n = len(reviews)
i = 0
prepped_opinions = 0
total_opinions = 0
prepared_opinions_count = 0
total_opinions_count = 0
prepared_counts = {
'positive': 0,
'neutral': 0,
'negative': 0
}
train_root = Element('data')
for review in reviews:
sentences = review[0]
......@@ -130,23 +137,52 @@ for review in reviews:
parse_tree_str = sentence.find('tree').text
parse_tree = Tree.fromstring(parse_tree_str)
nps = extract_extended_nouns(parse_tree)
opinion_trees = []
# attempt to identify opinion target in sentence
for opinion in opinions:
total_opinions += 1
total_opinions_count += 1
modified_tree = replace_feature_nps_tree(opinion[0], parse_tree, nps)
if modified_tree:
prepped_opinions += 1
print('---')
if modified_tree and prepared_counts[opinion[1]] < 500:
opinion_trees.append(modified_tree)
prepared_opinions_count += 1
prepared_counts[opinion[1]] += 1
print('review text:')
print(text)
print('')
print('tree:')
modified_tree.pretty_print()
print('modified text:')
print(' '.join(modified_tree.leaves()))
print('')
print('labelled opinion:')
print(opinion)
print('')
print('---')
# store in new train_tree
instance_node = SubElement(train_root, 'instance')
text_node = SubElement(instance_node, 'text')
text_node.text = text
opinion_node = SubElement(instance_node, 'opinion')
opinion_node.text = opinion[1]
opinion_tree_node = SubElement(instance_node, 'tree')
opinion_tree_node.text = str(modified_tree)
else:
pass
# print('---')
# print(text)
# print(nps)
# print(opinion)
i += 1
print('{}/{}'.format(i, n))
print('{}/{} opinions prepared'.format(prepped_opinions, total_opinions))
train_tree = ElementTree(train_root)
train_tree.write('ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml')
print('{}/{} opinions prepared'.format(prepared_opinions_count, total_opinions_count))
print(prepared_counts)
#########################
SocialSent Sentiment Data
#########################
This directory contains sentiment lexicons for the top 250 subreddits on Reddit (by comment count excluding non-English communities)
See http://nlp.stanford.edu/projects/socialsent for links to the accompanying paper, with details on the algorithm, seed words, and data sources.
All files are .tsv's of the form:
<word> <mean_sentiment> <std_sentiment>
where mean_sentiment is the averaged inferred sentiment across bootstrap-sampled SentProp runs
and std_sentiment is the standard deviation of these samples.
SentProp was run with the following hyperparameters:
num nearest neighbors k=25
random walk beta=0.9
50 bootstrap samples of size 7
from nltk.tree import ParentedTree as Tree
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
def _read_sentiment_lexicon(path):
    """Load a tab-separated sentiment lexicon indexed by word.

    Malformed rows are dropped with a warning instead of aborting the load.
    """
    options = dict(index_col=0, header=None, names=['mean', 'std'], sep='\t')
    try:
        # pandas >= 1.3 spelling; 'warn' drops bad rows and warns, which is
        # what error_bad_lines=False (with its default warning) used to do.
        return pd.read_csv(path, on_bad_lines='warn', **options)
    except TypeError:
        # Older pandas does not accept on_bad_lines; use the legacy keyword.
        return pd.read_csv(path, error_bad_lines=False, **options)


class FeatureVector:
    """Feature vector for one sentence parse tree with a labelled argument ARG.

    ``vector`` is a flat list: the mean Word2Vec embedding of the sentence
    tokens followed by two sentiment-lexicon scores (positive, negative).
    """

    # Loaded once, when the class is created (i.e. at import time).
    sentiment_lexicon = _read_sentiment_lexicon('data/SocialSent/subreddit_sentiment_lexicons/technology.tsv')

    # in: sentence parse tree with labelled argument ARG
    def __init__(self, tree, token_model):
        """Build the feature vector.

        tree: parse tree exposing ``leaves()``; the 'ARG' placeholder leaf is
            excluded from all features.
        token_model: trained Word2Vec model; its ``wv`` provides the token
            embeddings.
        """
        self.vector = []

        tokens = [token for token in tree.leaves() if token != 'ARG']

        # target-independent features:

        # words, punctuation using mean of Word2Vec vectors for tokens.
        # Tokens missing from the vocabulary are skipped rather than raising
        # KeyError; with no known token at all the embedding part is zeros so
        # the vector keeps its fixed length.
        known = [token_model.wv[token] for token in tokens if token in token_model.wv]
        if known:
            embedding = np.mean(np.array(known), axis=0)
        else:
            embedding = np.zeros(token_model.wv.vector_size)
        self.vector.extend(embedding)

        # sentiment lexicon features
        self.vector.extend(self.sentiment_scores(tokens))

        # TODO: target-dependent features

    def sentiment_scores(self, l):
        """Return ``(pos, neg)`` lexicon scores for the token list ``l``.

        Each score is the summed absolute mean sentiment of the tokens found
        in the lexicon with non-negative (pos) or negative (neg) polarity,
        divided by the total number of tokens. An empty list gives
        ``(0.0, 0.0)`` instead of dividing by zero.
        """
        if not l:
            return (0.0, 0.0)

        pos = 0
        neg = 0
        for token in l:
            if token in self.sentiment_lexicon.index:
                mean, std = self.sentiment_lexicon.loc[token]
                if mean < 0:
                    neg += abs(mean)
                else:
                    pos += abs(mean)
        return (pos / len(l), neg / len(l))
import xml.etree.ElementTree as ET
from nltk.tree import ParentedTree as Tree
from sklearn import svm
from feature_vector import FeatureVector
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.metrics import confusion_matrix
import os
class SentimentAnalyzer:
    """Word2Vec token embeddings plus an SVM classifier over FeatureVectors."""

    # Class-level SVM, shared by every instance. The labels it predicts are
    # whatever ``targets`` were passed to train_expr_clf.
    expr_clf = svm.SVC()
    # Previously saved Word2Vec model if one exists in the working directory,
    # otherwise None until train_w2v_model is called.
    token_model = None if not os.path.isfile('./word2vec.model') else Word2Vec.load('word2vec.model')

    @staticmethod
    def _vectors(feature_vectors):
        """Collect the raw ``vector`` list of every FeatureVector."""
        return [fv.vector for fv in feature_vectors]

    def train_w2v_model(self, texts):
        """Train Word2Vec on ``texts`` (token lists), keep it and save it to disk."""
        # NOTE(review): `size` looks like the gensim 3.x keyword (later
        # renamed `vector_size`) - confirm against the installed version.
        model = Word2Vec(texts, size=100, window=5, min_count=1, workers=4)
        self.token_model = model
        model.save('word2vec.model')

    def train_expr_clf(self, feature_vectors, targets):
        """Fit the SVM on the feature vectors and their target labels.

        The full feature matrix and the targets are printed before fitting.
        """
        print(self._vectors(feature_vectors))
        print(targets)
        self.expr_clf.fit(self._vectors(feature_vectors), targets)

    def get_feature_vector(self, tree):
        """Wrap ``tree`` in a FeatureVector built with the current token model."""
        return FeatureVector(tree, self.token_model)

    # in: feature vectors of sentence parse trees with labelled argument ARG
    # out: the classifier's prediction for each feature vector
    def expresses_sentiment(self, feature_vectors):
        """Return the SVM predictions for ``feature_vectors``."""
        return self.expr_clf.predict(self._vectors(feature_vectors))
# Train the Word2Vec model and the SVM on the labelled training instances,
# then print a confusion matrix for the held-out test instances.
sa = SentimentAnalyzer()

train_tree = ET.parse('data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml')
instances = train_tree.getroot()

# Parse every training tree once and reuse it for both Word2Vec training and
# feature extraction.
train_parse_trees = [Tree.fromstring(instance.find('tree').text) for instance in instances]

sa.train_w2v_model([parse_tree.leaves() for parse_tree in train_parse_trees])

feature_vectors = [sa.get_feature_vector(parse_tree) for parse_tree in train_parse_trees]
targets = [instance.find('opinion').text for instance in instances]

sa.train_expr_clf(feature_vectors, targets)

test_tree = ET.parse('data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml')
# Evaluate on the test file's root. This previously read train_tree again, so
# the confusion matrix was computed on the training data.
test_instances = test_tree.getroot()

# NOTE(review): the Word2Vec model is trained on the training sentences only,
# so test sentences may contain out-of-vocabulary tokens - confirm that
# FeatureVector tolerates them.
feature_vectors = [sa.get_feature_vector(Tree.fromstring(instance.find('tree').text)) for instance in test_instances]
targets = [instance.find('opinion').text for instance in test_instances]

pred = sa.expresses_sentiment(feature_vectors)

# Rows are true labels, columns are predictions, in the order given below.
cm = confusion_matrix(targets, pred, labels=['positive', 'neutral', 'negative'])
print(cm)
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Reviews>
<Review rid="en_BlueRibbonSushi_478218171">
<sentences>
<sentence id="en_BlueRibbonSushi_478218171:0">
<text>Yum!</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:1">
<text>Serves really good sushi.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:2">
<text>Not the biggest portions but adequate.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:3">
<text>Green Tea creme brulee is a must!</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:4">
<text>Don&apos;t leave the restaurant without it.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_BlueRibbonSushi_478218345">
<sentences>
<sentence id="en_BlueRibbonSushi_478218345:0">
<text>No Comparison</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218345:1">
<text>– I can&apos;t say enough about this place.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218345:2">
<text>It has great sushi and even better service.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218345:3">
<text>The entire staff was extremely accomodating and tended to my every need.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218345:4">
<text>I&apos;ve been to this restaurant over a dozen times with no complaints to date.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_SchoonerOrLater_477965690">
<sentences>
<sentence id="en_SchoonerOrLater_477965690:0">
<text>Snotty Attitude</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965690:1">
<text>– We were treated very rudely here one time for breakfast.</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965690:2">
<text>The owner is belligerent to guests that have a complaint.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_SchoonerOrLater_477965849">
<sentences>
<sentence id="en_SchoonerOrLater_477965849:0">
<text>Good food!</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:1">
<text>– We love breakfast food.</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:2">
<text>This is a great place to get a delicious meal.</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:3">
<text>We never had to wait more than 5 minutes.</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:4">
<text>The staff is pretty friendly.</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:5">
<text>The onion rings are great!</text>
</sentence>
<sentence id="en_SchoonerOrLater_477965849:6">
<text>They are not greasy or anything.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_PagodaRestaurant_478006817">
<sentences>
<sentence id="en_PagodaRestaurant_478006817:0">
<text>Overrated</text>
</sentence>
<sentence id="en_PagodaRestaurant_478006817:1">
<text>– I was highly disappointed in the food at Pagoda.</text>
</sentence>
<sentence id="en_PagodaRestaurant_478006817:2">
<text>The lemon chicken tasted like sticky sweet donuts and the honey walnut prawns, the few they actually give you.....were not good.</text>
</sentence>
<sentence id="en_PagodaRestaurant_478006817:3" OutOfScope="TRUE">
<text>The prices are outrageous, especially since the food was actually less satisfying than most neighborhood Chinese establishments.</text>
</sentence>
<sentence id="en_PagodaRestaurant_478006817:4">
<text>Nice ambience, but highly overrated place.</text>
</sentence>
<sentence id="en_PagodaRestaurant_478006817:5">
<text>I will not go back.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_ParkChaletGardenRestaurant_477778282">
<sentences>
<sentence id="en_ParkChaletGardenRestaurant_477778282:0">
<text>Worst Service I Ever Had</text>
</sentence>
<sentence id="en_ParkChaletGardenRestaurant_477778282:1">
<text>– A group of 5 of us went there for Sunday brunch and sat outside.</text>
</sentence>
<sentence id="en_ParkChaletGardenRestaurant_477778282:2">
<text>Everyone that sat in the back outside agreed that it was the worst service we had ever received.</text>
</sentence>
<sentence id="en_ParkChaletGardenRestaurant_477778282:3">
<text>Our waiter was non-existent and after our food finally arrived over an hour after we ordered, we were not given any water or utensils.</text>
</sentence>
<sentence id="en_ParkChaletGardenRestaurant_477778282:4">
<text>I complained to the manager, but he was not even apologetic.</text>
</sentence>
<sentence id="en_ParkChaletGardenRestaurant_477778282:5">
<text>I will never return again.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_MiopostoCaffe_478543071">
<sentences>
<sentence id="en_MiopostoCaffe_478543071:0">
<text>Fabulous Italian Food!</text>
</sentence>
<sentence id="en_MiopostoCaffe_478543071:1">
<text>– I highly recommend Mioposto.</text>
</sentence>
<sentence id="en_MiopostoCaffe_478543071:2">
<text>I am so happy to have a wonderful Italian restaurant in my neighborhood.</text>
</sentence>
<sentence id="en_MiopostoCaffe_478543071:3">
<text>The wine list is wonderful and the food reminds me of my recent trip to Italy.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_Murphy's_478075040">
<sentences>
<sentence id="en_Murphy's_478075040:0">
<text>I love this restaurant</text>
</sentence>
<sentence id="en_Murphy's_478075040:1">
<text>– I will never forget the amazing meal, service, and ambiance I experience at this restaurant.</text>
</sentence>
<sentence id="en_Murphy's_478075040:2">
<text>It was absolutely amazing.</text>
</sentence>
<sentence id="en_Murphy's_478075040:3">
<text>The wine list is incredible and extensive and diverse, the food is all incredible and the staff was all very nice, good at their jobs and cultured.</text>
</sentence>
<sentence id="en_Murphy's_478075040:4">
<text>I have not a bad thing to say about this place.</text>
</sentence>
<sentence id="en_Murphy's_478075040:5">
<text>AMAZING.</text>
</sentence>
<sentence id="en_Murphy's_478075040:6">
<text>I cannot wait to go back again this coming weekend!</text>
</sentence>
</sentences>
</Review>
<Review rid="en_OpenSesame_477970939">
<sentences>
<sentence id="en_OpenSesame_477970939:0">
<text>Mmm... good!</text>
</sentence>
<sentence id="en_OpenSesame_477970939:1">
<text>– Went there last night with a friend.</text>
</sentence>
<sentence id="en_OpenSesame_477970939:2">
<text>She had heard from a co-worker about this place.</text>
</sentence>
<sentence id="en_OpenSesame_477970939:3">
<text>The food was great!</text>
</sentence>
<sentence id="en_OpenSesame_477970939:4">
<text>It&apos;s *very* reasonably priced, esp for the quality of the food.</text>
</sentence>
<sentence id="en_OpenSesame_477970939:5">
<text>I had the Kafta plate and it was perfect.</text>
</sentence>
<sentence id="en_OpenSesame_477970939:6">
<text>We&apos;re going back. :D</text>
</sentence>
</sentences>
</Review>
<Review rid="en_Sage_480875505">
<sentences>
<sentence id="en_Sage_480875505:0">
<text>Finally a meal that you will remember for a long time!</text>
</sentence>
<sentence id="en_Sage_480875505:1">
<text>– In a age of incremental cost cutting in restaurants, its nice to see a place that bucks that trend, and just plain delivers high quality food and good service, period.</text>
</sentence>
<sentence id="en_Sage_480875505:2">
<text>This is the place to relax and enjoy the finest quality food the industry can offer.</text>
</sentence>
<sentence id="en_Sage_480875505:3">
<text>Caution - its real food for people who love the best.</text>
</sentence>
<sentence id="en_Sage_480875505:4" OutOfScope="TRUE">
<text>Some of the food is clearly oriented toward foodies (like me) so if your not a foodie (nothing wrong with that)- head over to McDonalds or some Ruby Tuesday and have at it!</text>
</sentence>
<sentence id="en_Sage_480875505:5" OutOfScope="TRUE">
<text>If your looking for real quality (not the BS that is seen on yelp ratings - which allegedly recently got in trouble for &quot;adjusting&quot; ratings), you need to get in here!</text>
</sentence>
</sentences>
</Review>
<Review rid="en_BlueRibbonSushi_478218900">
<sentences>
<sentence id="en_BlueRibbonSushi_478218900:0">
<text>Very Disappointing</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:1">
<text>– I took my parents here for their anniversary-very very disappointed!!!</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:2">
<text>I liked the atmosphere very much but the food was not worth the price.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:3">
<text>I may not be a sushi guru but I can tell you that the food here is just okay and that there is not much else to it.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:4">
<text>Rice is too dry, tuna wasn&apos;t so fresh either.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:5">
<text>Nothing really came across as outstanding.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218900:6">
<text>Very disappointed.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_BlueRibbonSushi_478219453">
<sentences>
<sentence id="en_BlueRibbonSushi_478219453:0">
<text>I have eaten here three times and have found the quality and variety of the fish to be excellent.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:1">
<text>However, the value and service are both severely lacking.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:2" OutOfScope="TRUE">
<text>Portions contain less fish than one would expect, particularly since prices are in line with the best sushi places in NYC (Masa excluded, of course).</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:3">
<text>Furthermore, while the fish is unquestionably fresh, rolls tend to be inexplicably bland.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:4">
<text>The service ranges from mediocre to offensive.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:5">
<text>On a recent trip, our waiter was extremely dismissive, while no less than three staff members waited hand-and-foot on a pair of Japanese girls seated nearby.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:6">
<text>We were then charged for their most expensive sake ($20+ per serving) when we in fact drank a sake of less than half that price.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478219453:7">
<text>But Im sure this was just an honest mistake...</text>
</sentence>
</sentences>
</Review>
<Review rid="en_BlueRibbonSushi_479929856">
<sentences>
<sentence id="en_BlueRibbonSushi_479929856:0">
<text>Freshest sushi – I love this restaurant.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:1">
<text>They pay such detail to everything from miso soup to complex rolls.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:2">
<text>The sashimi was the freshest and most tender I have ever tasted.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:3">
<text>Their apps are all delicious.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:4">
<text>The only drawback is that this place is really expensive and the portions are on the small side.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:5">
<text>But the space is small and lovely, and the service is helpful.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_479929856:6">
<text>You are bound to have a very charming time.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_BlueRibbonSushi_478218901">
<sentences>
<sentence id="en_BlueRibbonSushi_478218901:0">
<text>Not recommanded!!!</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218901:1">
<text>– The food was not great &amp; the waiters were rude.</text>
</sentence>
<sentence id="en_BlueRibbonSushi_478218901:2">
<text>It is not worth going at all and spend your money there!!!</text>
</sentence>
</sentences>
</Review>
<Review rid="en_MercedesRestaurant_478010605">
<sentences>
<sentence id="en_MercedesRestaurant_478010605:0">
<text>great service</text>
</sentence>
<sentence id="en_MercedesRestaurant_478010605:1">
<text>– Pretty cheap for sit down Mexican AND downtown.</text>
</sentence>
<sentence id="en_MercedesRestaurant_478010605:2">
<text>my service was stellar!</text>
</sentence>
<sentence id="en_MercedesRestaurant_478010605:3">
<text>the bus boy even spotted that my table was shaking a stabilized it for me.</text>
</sentence>
<sentence id="en_MercedesRestaurant_478010605:4">
<text>food was fine, with a some little-tastier-than-normal salsa.</text>
</sentence>
<sentence id="en_MercedesRestaurant_478010605:5">
<text>If you&apos;re in the area you shouldn&apos;t be disappointed.</text>
</sentence>
</sentences>
</Review>
<Review rid="en_MercedesRestaurant_480852899">
<sentences>
<sentence id="en_MercedesRestaurant_480852899:0">
<text>Ok.... – Maybe I went in on someone&apos;s bad day....</text>
</sentence>
<sentence id="en_MercedesRestaurant_480852899:1">
<text>the food was great, the margaritas too but the waitress was too busy being nice to her other larger party than to take better care of my friend and me.</text>
</sentence>
<sentence id="en_MercedesRestaurant_480852899:2">
<text>Took forever to get our order taken, water refills were too much to ask for and the only time she was fast was when we asked for our bill when we could get her attention.</text>
</sentence>
<sentence id="en_MercedesRestaurant_480852899:3">
<text>I don&apos;t know if I&apos;ll be back....</text>
</sentence>
</sentences>
</Review>
<Review rid="en_Patsy'sPizzeria_478231878">
<sentences>
<sentence id="en_Patsy'sPizzeria_478231878:0">
<text>Mama Mia – I live in the neighborhood and feel lucky to live by such a great pizza place.</text>
</sentence>