Commit 43892e17 authored by Joel Oksanen
Browse files

Started implementing bert in server

parent 605365c2
...@@ -12,7 +12,7 @@ import shap ...@@ -12,7 +12,7 @@ import shap
semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml' semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml' semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test.xml' amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
trained_model_path = 'semeval_2014_2.pt' trained_model_path = 'semeval_2014_2.pt'
BATCH_SIZE = 32 BATCH_SIZE = 32
...@@ -26,6 +26,12 @@ def loss(outputs, labels): ...@@ -26,6 +26,12 @@ def loss(outputs, labels):
class BertAnalyzer: class BertAnalyzer:
@staticmethod
def default():
sa = BertAnalyzer()
sa.load_saved('semeval_2014.pt')
return sa
def load_saved(self, path): def load_saved(self, path):
self.net = TDBertNet(len(polarity_indices)) self.net = TDBertNet(len(polarity_indices))
self.net.load_state_dict(torch.load(path)) self.net.load_state_dict(torch.load(path))
...@@ -93,7 +99,7 @@ class BertAnalyzer: ...@@ -93,7 +99,7 @@ class BertAnalyzer:
f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro') f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
print('macro F1:', f1) print('macro F1:', f1)
def analyze_sentence(self, text, char_from, char_to): def get_sentiment_polarity(self, text, char_from, char_to):
instance = Instance(text, char_from, char_to) instance = Instance(text, char_from, char_to)
tokens, tg_from, tg_to = instance.get() tokens, tg_from, tg_to = instance.get()
text, target_indices = instance.to_tensor() text, target_indices = instance.to_tensor()
...@@ -116,10 +122,18 @@ class BertAnalyzer: ...@@ -116,10 +122,18 @@ class BertAnalyzer:
# ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right') # ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right')
# plt.show() # plt.show()
_, pred = torch.max(outputs.data, 1) val, pred = torch.max(outputs.data, 1)
return pred if pred == 0:
# positive
return val
sentiment_analyzer = BertAnalyzer() elif pred == 1:
sentiment_analyzer.load_saved('semeval_2014.pt') # negative
print(sentiment_analyzer.analyze_sentence("Well built laptop with win7.", 11, 17)) return -val
\ No newline at end of file else:
# neutral or conflicted
return 0
sentiment_analyzer = BertAnalyzer.default()
sentiment_analyzer.evaluate(semeval_2014_test_path)
sentiment_analyzer.evaluate(amazon_test_path)
...@@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer ...@@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer
import os import os
import math import math
def resample_data(instances, labels): def resample_data(instances, labels):
label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels} label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels}
max_n_instances = max([len(v) for v in label_instances.values()]) max_n_instances = max([len(v) for v in label_instances.values()])
...@@ -22,6 +23,7 @@ def resample_data(instances, labels): ...@@ -22,6 +23,7 @@ def resample_data(instances, labels):
print(len(resampled_data)) print(len(resampled_data))
return resampled_data return resampled_data
class SentimentAnalyzer: class SentimentAnalyzer:
expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG
...@@ -39,10 +41,11 @@ class SentimentAnalyzer: ...@@ -39,10 +41,11 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances): def expresses_sentiment(self, instances):
return self.expr_clf.predict([instance.vector for instance in instances]) return self.expr_clf.predict([instance.vector for instance in instances])
semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml' semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml' semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml' amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml' amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml'
semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml' semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' # semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' #
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml' tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
...@@ -56,7 +59,7 @@ sa = SentimentAnalyzer() ...@@ -56,7 +59,7 @@ sa = SentimentAnalyzer()
train_tree = ET.parse(train_path) train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()] train_instances = [Instance(instance) for instance in train_tree.getroot()]
train_instances = resample_data(train_instances, labels) # train_instances = resample_data(train_instances, labels)
# create and train vectorizer model # create and train vectorizer model
vec = Vectorizer(train_instances) vec = Vectorizer(train_instances)
......
...@@ -17,7 +17,7 @@ class Vectorizer: ...@@ -17,7 +17,7 @@ class Vectorizer:
self.transformer = TfidfTransformer() self.transformer = TfidfTransformer()
# indep features: # indep features:
self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2)) self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,5))
texts = [instance.text for instance in train_instances] texts = [instance.text for instance in train_instances]
train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray() train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray()
train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances] train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances]
......
...@@ -5,11 +5,12 @@ from anytree import PostOrderIter ...@@ -5,11 +5,12 @@ from anytree import PostOrderIter
import pickle import pickle
from argument import * from argument import *
from functools import reduce from functools import reduce
from SA.bert_analyzer import BertAnalyzer
class Agent: class Agent:
sentiment_threshold = 0.95 sentiment_threshold = 0.95
review_tokenizer = ReviewTokenizer() review_tokenizer = ReviewTokenizer()
bert_analyzer = BertAnalyzer.default()
def __init__(self): def __init__(self):
# load classifier # load classifier
...@@ -27,7 +28,7 @@ class Agent: ...@@ -27,7 +28,7 @@ class Agent:
return phrases return phrases
# analyze sentiment # analyze sentiment
def get_sentiment(self, phrase): def get_bayes_sentiment(self, phrase):
# get classification # get classification
tokens = self.review_tokenizer.tokenize_review(phrase) tokens = self.review_tokenizer.tokenize_review(phrase)
prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens)) prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens))
...@@ -35,6 +36,9 @@ class Agent: ...@@ -35,6 +36,9 @@ class Agent:
strength = (prob_classification.prob(classification) - 0.5) * 2 strength = (prob_classification.prob(classification) - 0.5) * 2
return strength if classification == '+' else -strength return strength if classification == '+' else -strength
def get_bert_sentiment(self, text, char_from, char_to):
return self.bert_analyzer.get_sentiment_polarity(text, char_from, char_to)
# remove all ancestors of node in list l # remove all ancestors of node in list l
def remove_ancestors(self, node, l): def remove_ancestors(self, node, l):
if node.parent != None: if node.parent != None:
...@@ -51,8 +55,9 @@ class Agent: ...@@ -51,8 +55,9 @@ class Agent:
while len(arguments) > 0: while len(arguments) > 0:
f = arguments.pop(0) f = arguments.pop(0)
for word in glossary[f]: for word in glossary[f]:
if word in phrase: matches = [(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
argument_matches.append(f) if matches:
argument_matches += matches
self.remove_ancestors(f, arguments) self.remove_ancestors(f, arguments)
break break
return argument_matches return argument_matches
...@@ -61,17 +66,16 @@ class Agent: ...@@ -61,17 +66,16 @@ class Agent:
votes = {} votes = {}
vote_phrases = {} vote_phrases = {}
for phrase in phrases: for phrase in phrases:
arguments = self.get_arguments(phrase) for argument, start, end in self.get_arguments(phrase):
sentiment = self.get_sentiment(phrase) sentiment = self.get_bayes_sentiment(phrase) # self.get_bert_sentiment(phrase, start, end)
if abs(sentiment) > self.sentiment_threshold: if abs(sentiment) > self.sentiment_threshold:
for argument in arguments:
if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)): if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
votes[argument] = sentiment # what if there's two phrases with same argument? votes[argument] = sentiment # what if there's two phrases with same argument?
vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment} vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
# normalize votes to 1 (+) or -1 (-) # normalize votes to 1 (+) or -1 (-)
for argument in votes: for argument in votes:
votes[argument] = 1 if votes[argument] > 0 else -1 votes[argument] = 1 if votes[argument] > 0 else -1
return (votes, vote_phrases) return votes, vote_phrases
# augment votes (Definition 4.3) obtained for a single critic # augment votes (Definition 4.3) obtained for a single critic
def augment_votes(self, votes): def augment_votes(self, votes):
......
import pandas as pd import pandas as pd
class DataLoader:
class DataLoader:
data_location = 'camera_prepared_data.tsv' data_location = 'camera_prepared_data.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
......
...@@ -4,7 +4,7 @@ import jsonpickle ...@@ -4,7 +4,7 @@ import jsonpickle
from django.views.decorators.csrf import csrf_exempt from django.views.decorators.csrf import csrf_exempt
import sys import sys
sys.path.append('/Users/joeloksanen/individual_project/ADA') sys.path.append('/home/joel/individual_project/ADA')
from dataloader import DataLoader from dataloader import DataLoader
from communicator import Communicator from communicator import Communicator
......
...@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f' ...@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f'
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True
ALLOWED_HOSTS = ['192.168.0.13', '146.169.222.109', '146.169.218.37'] ALLOWED_HOSTS = ['192.168.1.104']
# Application definition # Application definition
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment