Commit 43892e17 authored by Joel Oksanen's avatar Joel Oksanen
Browse files

Started implementing BERT in server

parent 605365c2
......@@ -12,7 +12,7 @@ import shap
semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test.xml'
amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
trained_model_path = 'semeval_2014_2.pt'
BATCH_SIZE = 32
......@@ -26,6 +26,12 @@ def loss(outputs, labels):
class BertAnalyzer:
@staticmethod
def default(path='semeval_2014.pt'):
    """Build a ready-to-use BertAnalyzer with pretrained weights.

    The saved-model path is now a parameter so alternative checkpoints can
    be loaded; it defaults to the previously hard-coded 'semeval_2014.pt',
    so existing callers (e.g. Agent.bert_analyzer) are unaffected.

    :param path: filesystem path of the saved state dict to restore
    :return: a BertAnalyzer instance with its network loaded from *path*
    """
    sa = BertAnalyzer()
    sa.load_saved(path)
    return sa
def load_saved(self, path):
    """Restore the TD-BERT network from a state dict saved at *path*.

    NOTE(review): this hunk may be truncated by the diff view — confirm
    whether the original also calls self.net.eval() after loading.
    """
    # Re-create the architecture, then load the trained parameters into it.
    self.net = TDBertNet(len(polarity_indices))
    self.net.load_state_dict(torch.load(path))
......@@ -93,7 +99,7 @@ class BertAnalyzer:
f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
print('macro F1:', f1)
def analyze_sentence(self, text, char_from, char_to):
def get_sentiment_polarity(self, text, char_from, char_to):
instance = Instance(text, char_from, char_to)
tokens, tg_from, tg_to = instance.get()
text, target_indices = instance.to_tensor()
......@@ -116,10 +122,18 @@ class BertAnalyzer:
# ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right')
# plt.show()
_, pred = torch.max(outputs.data, 1)
return pred
sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.load_saved('semeval_2014.pt')
print(sentiment_analyzer.analyze_sentence("Well built laptop with win7.", 11, 17))
\ No newline at end of file
val, pred = torch.max(outputs.data, 1)
if pred == 0:
# positive
return val
elif pred == 1:
# negative
return -val
else:
# neutral or conflicted
return 0
sentiment_analyzer = BertAnalyzer.default()
sentiment_analyzer.evaluate(semeval_2014_test_path)
sentiment_analyzer.evaluate(amazon_test_path)
......@@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer
import os
import math
def resample_data(instances, labels):
label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels}
max_n_instances = max([len(v) for v in label_instances.values()])
......@@ -22,6 +23,7 @@ def resample_data(instances, labels):
print(len(resampled_data))
return resampled_data
class SentimentAnalyzer:
expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG
......@@ -39,6 +41,7 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances):
    """Classify each instance: does its sentence express sentiment towards ARG?

    Returns the SVM's per-instance predictions, in input order.
    """
    feature_vectors = [instance.vector for instance in instances]
    return self.expr_clf.predict(feature_vectors)
semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
......@@ -56,7 +59,7 @@ sa = SentimentAnalyzer()
train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()]
train_instances = resample_data(train_instances, labels)
# train_instances = resample_data(train_instances, labels)
# create and train vectorizer model
vec = Vectorizer(train_instances)
......
......@@ -17,7 +17,7 @@ class Vectorizer:
self.transformer = TfidfTransformer()
# indep features:
self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,5))
texts = [instance.text for instance in train_instances]
train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray()
train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances]
......
......@@ -5,11 +5,12 @@ from anytree import PostOrderIter
import pickle
from argument import *
from functools import reduce
from SA.bert_analyzer import BertAnalyzer
class Agent:
sentiment_threshold = 0.95
review_tokenizer = ReviewTokenizer()
bert_analyzer = BertAnalyzer.default()
def __init__(self):
# load classifier
......@@ -27,7 +28,7 @@ class Agent:
return phrases
# analyze sentiment
def get_sentiment(self, phrase):
def get_bayes_sentiment(self, phrase):
# get classification
tokens = self.review_tokenizer.tokenize_review(phrase)
prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens))
......@@ -35,6 +36,9 @@ class Agent:
strength = (prob_classification.prob(classification) - 0.5) * 2
return strength if classification == '+' else -strength
def get_bert_sentiment(self, text, char_from, char_to):
    """Score the sentiment of the target span [char_from, char_to) in *text*.

    Thin delegate: forwards to the shared BERT analyzer's polarity method.
    """
    analyzer = self.bert_analyzer
    polarity = analyzer.get_sentiment_polarity(text, char_from, char_to)
    return polarity
# remove all ancestors of node in list l
def remove_ancestors(self, node, l):
if node.parent != None:
......@@ -51,8 +55,9 @@ class Agent:
while len(arguments) > 0:
f = arguments.pop(0)
for word in glossary[f]:
if word in phrase:
argument_matches.append(f)
matches = [(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
if matches:
argument_matches += matches
self.remove_ancestors(f, arguments)
break
return argument_matches
......@@ -61,17 +66,16 @@ class Agent:
votes = {}
vote_phrases = {}
for phrase in phrases:
arguments = self.get_arguments(phrase)
sentiment = self.get_sentiment(phrase)
for argument, start, end in self.get_arguments(phrase):
sentiment = self.get_bayes_sentiment(phrase) # self.get_bert_sentiment(phrase, start, end)
if abs(sentiment) > self.sentiment_threshold:
for argument in arguments:
if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
votes[argument] = sentiment # what if there's two phrases with same argument?
vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
# normalize votes to 1 (+) or -1 (-)
for argument in votes:
votes[argument] = 1 if votes[argument] > 0 else -1
return (votes, vote_phrases)
return votes, vote_phrases
# augment votes (Definition 4.3) obtained for a single critic
def augment_votes(self, votes):
......
import pandas as pd
class DataLoader:
class DataLoader:
data_location = 'camera_prepared_data.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
......
......@@ -4,7 +4,7 @@ import jsonpickle
from django.views.decorators.csrf import csrf_exempt
import sys
sys.path.append('/Users/joeloksanen/individual_project/ADA')
sys.path.append('/home/joel/individual_project/ADA')
from dataloader import DataLoader
from communicator import Communicator
......
......@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = ['192.168.0.13', '146.169.222.109', '146.169.218.37']
ALLOWED_HOSTS = ['192.168.1.104']
# Application definition
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment