Skip to content
Snippets Groups Projects
Commit 43892e17 authored by Joel Oksanen
Browse files

Started implementing bert in server

parent 605365c2
No related branches found
No related tags found
No related merge requests found
...@@ -12,7 +12,7 @@ import shap ...@@ -12,7 +12,7 @@ import shap
semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml' semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml' semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test.xml' amazon_test_path = 'data/Amazon/annotated_amazon_laptop_reviews.xml'
trained_model_path = 'semeval_2014_2.pt' trained_model_path = 'semeval_2014_2.pt'
BATCH_SIZE = 32 BATCH_SIZE = 32
...@@ -26,6 +26,12 @@ def loss(outputs, labels): ...@@ -26,6 +26,12 @@ def loss(outputs, labels):
class BertAnalyzer: class BertAnalyzer:
@staticmethod
def default():
sa = BertAnalyzer()
sa.load_saved('semeval_2014.pt')
return sa
def load_saved(self, path): def load_saved(self, path):
self.net = TDBertNet(len(polarity_indices)) self.net = TDBertNet(len(polarity_indices))
self.net.load_state_dict(torch.load(path)) self.net.load_state_dict(torch.load(path))
...@@ -93,7 +99,7 @@ class BertAnalyzer: ...@@ -93,7 +99,7 @@ class BertAnalyzer:
f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro') f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
print('macro F1:', f1) print('macro F1:', f1)
def analyze_sentence(self, text, char_from, char_to): def get_sentiment_polarity(self, text, char_from, char_to):
instance = Instance(text, char_from, char_to) instance = Instance(text, char_from, char_to)
tokens, tg_from, tg_to = instance.get() tokens, tg_from, tg_to = instance.get()
text, target_indices = instance.to_tensor() text, target_indices = instance.to_tensor()
...@@ -116,10 +122,18 @@ class BertAnalyzer: ...@@ -116,10 +122,18 @@ class BertAnalyzer:
# ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right') # ax.set_xticklabels(tokens, rotation=45, rotation_mode='anchor', ha='right')
# plt.show() # plt.show()
_, pred = torch.max(outputs.data, 1) val, pred = torch.max(outputs.data, 1)
return pred if pred == 0:
# positive
return val
sentiment_analyzer = BertAnalyzer() elif pred == 1:
sentiment_analyzer.load_saved('semeval_2014.pt') # negative
print(sentiment_analyzer.analyze_sentence("Well built laptop with win7.", 11, 17)) return -val
\ No newline at end of file else:
# neutral or conflicted
return 0
sentiment_analyzer = BertAnalyzer.default()
sentiment_analyzer.evaluate(semeval_2014_test_path)
sentiment_analyzer.evaluate(amazon_test_path)
...@@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer ...@@ -11,6 +11,7 @@ from sklearn.feature_extraction.text import CountVectorizer
import os import os
import math import math
def resample_data(instances, labels): def resample_data(instances, labels):
label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels} label_instances = {label: [instance for instance in instances if instance.opinion == label] for label in labels}
max_n_instances = max([len(v) for v in label_instances.values()]) max_n_instances = max([len(v) for v in label_instances.values()])
...@@ -22,6 +23,7 @@ def resample_data(instances, labels): ...@@ -22,6 +23,7 @@ def resample_data(instances, labels):
print(len(resampled_data)) print(len(resampled_data))
return resampled_data return resampled_data
class SentimentAnalyzer: class SentimentAnalyzer:
expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG expr_clf = svm.SVC() # determines if sentence expresses sentiment towards ARG
...@@ -39,10 +41,11 @@ class SentimentAnalyzer: ...@@ -39,10 +41,11 @@ class SentimentAnalyzer:
def expresses_sentiment(self, instances): def expresses_sentiment(self, instances):
return self.expr_clf.predict([instance.vector for instance in instances]) return self.expr_clf.predict([instance.vector for instance in instances])
semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml' semeval_2014_train_path = 'data/SemEval-2014/SemEval_2014_Laptop_Train_with_labelled_parse_trees.xml'
semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml' semeval_2014_test_path = 'data/SemEval-2014/SemEval_2014_Laptop_Test_with_labelled_parse_trees.xml'
amazon_train_path = 'data/Amazon/amazon_camera_train.xml' amazon_train_path = 'data/Amazon/amazon_camera_train.xml'
amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml' amazon_test_path = 'data/Amazon/amazon_camera_test2.xml' # 'data/Amazon/prepared_amazon_camera_reviews.xml'
semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml' semeval_train_path = 'data/SemEval-2016/ABSA16_Laptops_Train_SB1_v2_with_labelled_parse_trees.xml'
semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' # semeval_test_path = 'data/SemEval-2016/ABSA16_Laptops_Test_with_labelled_parse_trees.xml' #
tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml' tweet_train_path = 'data/acl-14-short-data/tweet_train_with_labelled_parse_trees.xml'
...@@ -56,7 +59,7 @@ sa = SentimentAnalyzer() ...@@ -56,7 +59,7 @@ sa = SentimentAnalyzer()
train_tree = ET.parse(train_path) train_tree = ET.parse(train_path)
train_instances = [Instance(instance) for instance in train_tree.getroot()] train_instances = [Instance(instance) for instance in train_tree.getroot()]
train_instances = resample_data(train_instances, labels) # train_instances = resample_data(train_instances, labels)
# create and train vectorizer model # create and train vectorizer model
vec = Vectorizer(train_instances) vec = Vectorizer(train_instances)
......
...@@ -17,7 +17,7 @@ class Vectorizer: ...@@ -17,7 +17,7 @@ class Vectorizer:
self.transformer = TfidfTransformer() self.transformer = TfidfTransformer()
# indep features: # indep features:
self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2)) self.bow_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,5))
texts = [instance.text for instance in train_instances] texts = [instance.text for instance in train_instances]
train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray() train_bow_vectors = self.bow_vectorizer.fit_transform(texts).toarray()
train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances] train_sent_vectors = [self.sentiment_scores(instance) for instance in train_instances]
......
...@@ -5,11 +5,12 @@ from anytree import PostOrderIter ...@@ -5,11 +5,12 @@ from anytree import PostOrderIter
import pickle import pickle
from argument import * from argument import *
from functools import reduce from functools import reduce
from SA.bert_analyzer import BertAnalyzer
class Agent: class Agent:
sentiment_threshold = 0.95 sentiment_threshold = 0.95
review_tokenizer = ReviewTokenizer() review_tokenizer = ReviewTokenizer()
bert_analyzer = BertAnalyzer.default()
def __init__(self): def __init__(self):
# load classifier # load classifier
...@@ -27,7 +28,7 @@ class Agent: ...@@ -27,7 +28,7 @@ class Agent:
return phrases return phrases
# analyze sentiment # analyze sentiment
def get_sentiment(self, phrase): def get_bayes_sentiment(self, phrase):
# get classification # get classification
tokens = self.review_tokenizer.tokenize_review(phrase) tokens = self.review_tokenizer.tokenize_review(phrase)
prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens)) prob_classification = self.classifier.prob_classify(dict([token, True] for token in tokens))
...@@ -35,6 +36,9 @@ class Agent: ...@@ -35,6 +36,9 @@ class Agent:
strength = (prob_classification.prob(classification) - 0.5) * 2 strength = (prob_classification.prob(classification) - 0.5) * 2
return strength if classification == '+' else -strength return strength if classification == '+' else -strength
def get_bert_sentiment(self, text, char_from, char_to):
return self.bert_analyzer.get_sentiment_polarity(text, char_from, char_to)
# remove all ancestors of node in list l # remove all ancestors of node in list l
def remove_ancestors(self, node, l): def remove_ancestors(self, node, l):
if node.parent != None: if node.parent != None:
...@@ -51,8 +55,9 @@ class Agent: ...@@ -51,8 +55,9 @@ class Agent:
while len(arguments) > 0: while len(arguments) > 0:
f = arguments.pop(0) f = arguments.pop(0)
for word in glossary[f]: for word in glossary[f]:
if word in phrase: matches = [(f, m.start(), m.end()) for m in re.finditer(word, phrase)]
argument_matches.append(f) if matches:
argument_matches += matches
self.remove_ancestors(f, arguments) self.remove_ancestors(f, arguments)
break break
return argument_matches return argument_matches
...@@ -61,17 +66,16 @@ class Agent: ...@@ -61,17 +66,16 @@ class Agent:
votes = {} votes = {}
vote_phrases = {} vote_phrases = {}
for phrase in phrases: for phrase in phrases:
arguments = self.get_arguments(phrase) for argument, start, end in self.get_arguments(phrase):
sentiment = self.get_sentiment(phrase) sentiment = self.get_bayes_sentiment(phrase) # self.get_bert_sentiment(phrase, start, end)
if abs(sentiment) > self.sentiment_threshold: if abs(sentiment) > self.sentiment_threshold:
for argument in arguments:
if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)): if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
votes[argument] = sentiment # what if there's two phrases with same argument? votes[argument] = sentiment # what if there's two phrases with same argument?
vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment} vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
# normalize votes to 1 (+) or -1 (-) # normalize votes to 1 (+) or -1 (-)
for argument in votes: for argument in votes:
votes[argument] = 1 if votes[argument] > 0 else -1 votes[argument] = 1 if votes[argument] > 0 else -1
return (votes, vote_phrases) return votes, vote_phrases
# augment votes (Definition 4.3) obtained for a single critic # augment votes (Definition 4.3) obtained for a single critic
def augment_votes(self, votes): def augment_votes(self, votes):
......
import pandas as pd import pandas as pd
class DataLoader:
class DataLoader:
data_location = 'camera_prepared_data.tsv' data_location = 'camera_prepared_data.tsv'
reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False) reviews = pd.read_csv(data_location, sep='\t', error_bad_lines=False)
......
...@@ -4,7 +4,7 @@ import jsonpickle ...@@ -4,7 +4,7 @@ import jsonpickle
from django.views.decorators.csrf import csrf_exempt from django.views.decorators.csrf import csrf_exempt
import sys import sys
sys.path.append('/Users/joeloksanen/individual_project/ADA') sys.path.append('/home/joel/individual_project/ADA')
from dataloader import DataLoader from dataloader import DataLoader
from communicator import Communicator from communicator import Communicator
......
...@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f' ...@@ -25,7 +25,7 @@ SECRET_KEY = 'z)tj_b=**v@b5-l6s!$*+_0=nzmor8dc#y$-%4%45kt8e8q@-f'
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True
ALLOWED_HOSTS = ['192.168.0.13', '146.169.222.109', '146.169.218.37'] ALLOWED_HOSTS = ['192.168.1.104']
# Application definition # Application definition
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment