Commit 52741466 authored by Joel Oksanen
Implemented target-dependent BERT which matches reported state-of-the-art results.
parent 7718ad0b
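
The commit adds three new components, shown below: a TDBertNet model, a BertDataset loader for the SemEval-2014 laptop data, and a BertAnalyzer driver. A minimal usage sketch, assuming the driver script is importable as bert_analyzer (the module name is not shown in the diff):

from bert_analyzer import BertAnalyzer  # module name assumed

analyzer = BertAnalyzer()
analyzer.train()       # fine-tune on Laptop_Train_v2.xml and save weights to semeval_2014.pt
analyzer.load_saved()  # reload the saved weights in eval mode
analyzer.evaluate()    # print accuracy, confusion matrix and macro F1 on the gold test set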
# .gitignore: exclude IDE config and saved PyTorch weights
.idea
*.pt
# BertAnalyzer: trains and evaluates the target-dependent BERT net on SemEval-2014
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tdbertnet import TDBertNet
from bert_dataset import BertDataset, polarity_indices, generate_batch
import time
import numpy as np
from sklearn import metrics

semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
trained_model_path = 'semeval_2014.pt'

BATCH_SIZE = 32
MAX_EPOCHS = 6
LEARNING_RATE = 0.00002  # 2e-5, within the range typically used for fine-tuning BERT

loss_criterion = nn.CrossEntropyLoss()


def loss(outputs, labels):
    return loss_criterion(outputs, labels)
class BertAnalyzer:

    def load_saved(self):
        self.net = TDBertNet(len(polarity_indices))
        self.net.load_state_dict(torch.load(trained_model_path))
        self.net.eval()

    def train(self):
        train_data = BertDataset(semeval_2014_train_path)
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                  collate_fn=generate_batch)

        self.net = TDBertNet(len(polarity_indices))
        optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)

        start = time.time()
        for epoch in range(MAX_EPOCHS):
            batch_loss = 0.0
            for i, (texts, target_indices, labels) in enumerate(train_loader):
                # zero param gradients
                optimiser.zero_grad()

                # forward pass
                outputs = self.net(texts, target_indices)

                # backward pass
                l = loss(outputs, labels)
                l.backward()

                # optimise
                optimiser.step()

                # print interim stats every 10 batches
                batch_loss += l.item()
                if i % 10 == 9:
                    print('epoch:', epoch + 1, '-- batch:', i + 1, '-- avg loss:', batch_loss / 10)
                    batch_loss = 0.0

        end = time.time()
        print('Training took', end - start, 'seconds')

        torch.save(self.net.state_dict(), trained_model_path)
    def evaluate(self):
        test_data = BertDataset(semeval_2014_test_path)
        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                                 collate_fn=generate_batch)

        predicted = []
        truths = []
        with torch.no_grad():
            for (texts, target_indices, labels) in test_loader:
                outputs = self.net(texts, target_indices)
                _, pred = torch.max(outputs.data, 1)
                predicted += pred.tolist()
                truths += labels.tolist()

        correct = (np.array(predicted) == np.array(truths))
        accuracy = correct.sum() / correct.size
        print('accuracy:', accuracy)

        cm = metrics.confusion_matrix(truths, predicted, labels=range(len(polarity_indices)))
        print('confusion matrix:')
        print(cm)

        f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
        print('macro F1:', f1)


sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.load_saved()
sentiment_analyzer.evaluate()
# bert_dataset.py
import torch
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET
from transformers import BertTokenizer
from tdbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
import re

MAX_SEQ_LEN = 128
polarity_indices = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}

tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
def generate_batch(batch):
    texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
                                        max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
                                        return_tensors='pt')

    # pad every target span to the longest span in the batch, repeating each
    # index across all hidden dimensions so it can be fed to torch.gather
    max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
    target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES
                                    for t in range(entry['from'], entry['from'] + max_tg_len + 1)]
                                   for entry in batch])

    polarity_labels = torch.tensor([entry['polarity'] for entry in batch])

    return texts, target_indices, polarity_labels
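
# Illustration (toy values, not from the dataset): if an entry's target spans
# tokens 3-4 while the longest target in the batch spans 3 tokens, its rows
# above come out as [3]*768, [4]*768, [4]*768 -- shorter targets repeat their
# final token index as padding so every entry has equally long index rows.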
def token_for_char(char_idx, text, tokens):
    # map a character offset in the raw text to the index of the WordPiece
    # token covering it, by counting non-whitespace characters
    compressed_idx = len(re.sub(r'\s+', '', text)[:char_idx + 1]) - 1
    token_idx = -1
    while compressed_idx >= 0:
        token_idx += 1
        compressed_idx -= len(tokens[token_idx].replace('##', ''))
    return token_idx


def polarity_index(polarity):
    return polarity_indices[polarity]
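
# Worked example (hypothetical input): for text = 'battery life' tokenised as
# ['battery', 'life'] and char_idx = 8 (the 'l' of 'life'), the whitespace-
# stripped prefix 'batteryl' gives compressed_idx = 7; subtracting
# len('battery') leaves 0 (still >= 0), and subtracting len('life') goes
# negative at token_idx = 1, so token 1 is returned.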
class BertDataset(Dataset):

    def __init__(self, xml_file):
        tree = ET.parse(xml_file)
        self.data = []
        for sentence in tree.getroot():
            text = sentence.find('text').text
            aspect_terms = sentence.find('aspectTerms')
            if aspect_terms is not None:
                for term in aspect_terms:
                    char_from = int(term.attrib['from'])
                    char_to = int(term.attrib['to']) - 1  # make the end offset inclusive
                    polarity = term.attrib['polarity']
                    self.data.append((text, char_from, char_to, polarity))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, char_from, char_to, polarity_str = self.data[idx]
        tokens = tokenizer.tokenize(text)
        idx_from = token_for_char(char_from, text, tokens)
        idx_to = token_for_char(char_to, text, tokens)
        polarity = polarity_index(polarity_str)
        return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
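
For reference, BertDataset expects SemEval-2014 ABSA XML of roughly the following shape (a hand-made sample, not taken from the dataset); the 'from'/'to' attributes are character offsets with an exclusive end, which the parser converts to an inclusive one:

import xml.etree.ElementTree as ET

sample = '''<sentences>
  <sentence id="42">
    <text>The battery life is great</text>
    <aspectTerms>
      <aspectTerm term="battery life" polarity="positive" from="4" to="16"/>
    </aspectTerms>
  </sentence>
</sentences>'''

sentence = ET.fromstring(sample)[0]
term = sentence.find('aspectTerms')[0]
text = sentence.find('text').text
print(text[int(term.attrib['from']):int(term.attrib['to'])])  # battery life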
# tdbertnet.py
import torch
import torch.nn as nn
from transformers import BertModel

HIDDEN_OUTPUT_FEATURES = 768
TRAINED_WEIGHTS = 'bert-base-uncased'


class TDBertNet(nn.Module):

    def __init__(self, num_class):
        super(TDBertNet, self).__init__()
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS)
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels

    def forward(self, texts, target_indices):
        # BERT
        bert_output = self.bert_base(**texts)[0]
        # max pooling at target locations
        target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
        pooled_output = torch.max(target_outputs, dim=1)[0]
        # fc layer
        x = self.fc(pooled_output)
        return x
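
A minimal sketch (toy sizes, made-up values) of the gather-then-max-pool step in forward, showing how the index tensor built in generate_batch selects the target tokens' hidden vectors before pooling:

import torch

bert_output = torch.arange(2 * 5 * 4, dtype=torch.float).view(2, 5, 4)  # batch=2, seq=5, hidden=4
# entry 0 targets tokens 1-2; entry 1 targets token 3 only, padded by repetition
target_indices = torch.tensor([[[1] * 4, [2] * 4],
                               [[3] * 4, [3] * 4]])
target_outputs = torch.gather(bert_output, dim=1, index=target_indices)  # shape (2, 2, 4)
pooled = torch.max(target_outputs, dim=1)[0]                             # shape (2, 4)
print(pooled.shape)  # torch.Size([2, 4])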
@@ -6,8 +6,8 @@ import pickle
 from argument import *
 from functools import reduce
 
-class Agent:
+class Agent:
     sentiment_threshold = 0.95
     review_tokenizer = ReviewTokenizer()
@@ -22,7 +22,8 @@ class Agent:
         sentences = sent_tokenize(review_body)
         phrases = []
         for sentence in sentences:
-            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />', sentence)
+            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
+                                sentence)
         return phrases
 
     # analyze sentiment
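
The splitting regex above cuts each sentence at contrastive conjunctions so that each phrase carries a single sentiment; a minimal illustration (made-up sentence):

import re

sentence = 'The screen is great but the battery is poor'
print(re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />', sentence))
# ['The screen is great', 'the battery is poor']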
@@ -65,7 +66,7 @@ class Agent:
             if abs(sentiment) > self.sentiment_threshold:
                 for argument in arguments:
                     if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
-                        votes[argument] = sentiment # what if there's two phrases with same argument?
+                        votes[argument] = sentiment  # what if there's two phrases with same argument?
                         vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
         # normalize votes to 1 (+) or -1 (-)
         for argument in votes:
@@ -138,14 +139,15 @@ class Agent:
                 attacker_strengths.append(strengths[child])
             elif child in qbaf['supporters'][argument]:
                 supporter_strengths.append(strengths[child])
-            strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths, supporter_strengths)
+            strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
+                                                         supporter_strengths)
         return strengths
 
     def analyze_reviews(self, reviews):
         # get ra
         self.ra = []
-        self.vote_sum = {argument : 0 for argument in arguments}
-        self.vote_phrases = {argument : [] for argument in arguments}
+        self.vote_sum = {argument: 0 for argument in arguments}
+        self.vote_phrases = {argument: [] for argument in arguments}
         voting_reviews = 0
         review_count = 0
         for _, review in reviews.iterrows():
@@ -175,25 +177,27 @@ class Agent:
         print(self.strengths)
         print('votes:')
         for argument in arguments:
-            print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)), len(self.attacking_phrases(argument))))
+            print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
+                                                                      len(self.attacking_phrases(argument))))
             print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))
 
     def get_strongest_supporting_subfeature(self, argument):
         supporters = self.qbaf['supporters'][argument]
         if len(supporters) == 0:
             return None
-        supporter_strengths = {s : self.strengths[s] for s in supporters}
+        supporter_strengths = {s: self.strengths[s] for s in supporters}
         return max(supporter_strengths, key=supporter_strengths.get)
 
     def get_strongest_attacking_subfeature(self, argument):
         attackers = self.qbaf['attackers'][argument]
         if len(attackers) == 0:
             return None
-        attacker_strengths = {a : self.strengths[a] for a in attackers}
+        attacker_strengths = {a: self.strengths[a] for a in attackers}
         return max(attacker_strengths, key=attacker_strengths.get)
 
     def liked_argument(self, argument):
-        return self.vote_sum[argument] >= 0 # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+        return self.vote_sum[
+                   argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
 
     def supported_argument(self, argument):
         return (self.get_strongest_supporting_subfeature(argument) != None and
@@ -204,13 +208,13 @@ class Agent:
                 self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
 
     def best_supporting_phrase(self, argument):
-        phrases = {vp['phrase'] : vp['sentiment'] for vp in self.supporting_phrases(argument)}
+        phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
         if len(phrases) == 0:
             return None
         return max(phrases, key=phrases.get)
 
     def best_attacking_phrase(self, argument):
-        phrases = {vp['phrase'] : vp['sentiment'] for vp in self.attacking_phrases(argument)}
+        phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)}
         if len(phrases) == 0:
             return None
         return min(phrases, key=phrases.get)
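
best_supporting_phrase and best_attacking_phrase pick the most extreme phrase by using the dict's get method as the ordering key; a minimal illustration (made-up values):

phrases = {'great battery': 0.98, 'decent battery': 0.96}
print(max(phrases, key=phrases.get))  # great battery
print(min(phrases, key=phrases.get))  # decent battery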
@@ -3,7 +3,6 @@ import math
 from nltk.tokenize import TweetTokenizer
 import os
 from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
-from datetime import datetime
 from xml.dom import minidom
 import nltk.data
 from stanfordcorenlp import StanfordCoreNLP