Commit 52741466 authored by Joel Oksanen

Implemented target-dependent BERT, which achieves results equivalent to the reported state of the art.
parent 7718ad0b
.idea
*.pt
\ No newline at end of file
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tdbertnet import TDBertNet
from bert_dataset import BertDataset, polarity_indices, generate_batch
import time
import numpy as np
from sklearn import metrics

semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
trained_model_path = 'semeval_2014.pt'

BATCH_SIZE = 32
MAX_EPOCHS = 6
LEARNING_RATE = 0.00002

loss_criterion = nn.CrossEntropyLoss()


def loss(outputs, labels):
    return loss_criterion(outputs, labels)


class BertAnalyzer:

    def load_saved(self):
        # restore a previously fine-tuned model from disk
        self.net = TDBertNet(len(polarity_indices))
        self.net.load_state_dict(torch.load(trained_model_path))
        self.net.eval()

    def train(self):
        train_data = BertDataset(semeval_2014_train_path)
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
                                  collate_fn=generate_batch)

        self.net = TDBertNet(len(polarity_indices))
        optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)

        start = time.time()
        for epoch in range(MAX_EPOCHS):
            batch_loss = 0.0
            for i, (texts, target_indices, labels) in enumerate(train_loader):
                # zero param gradients
                optimiser.zero_grad()

                # forward pass
                outputs = self.net(texts, target_indices)

                # backward pass
                l = loss(outputs, labels)
                l.backward()

                # optimise
                optimiser.step()

                # print interim stats every 10 batches
                batch_loss += l.item()
                if i % 10 == 9:
                    print('epoch:', epoch + 1, '-- batch:', i + 1, '-- avg loss:', batch_loss / 10)
                    batch_loss = 0.0

        end = time.time()
        print('Training took', end - start, 'seconds')

        torch.save(self.net.state_dict(), trained_model_path)

    def evaluate(self):
        test_data = BertDataset(semeval_2014_test_path)
        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                                 collate_fn=generate_batch)

        predicted = []
        truths = []
        with torch.no_grad():
            for (texts, target_indices, labels) in test_loader:
                outputs = self.net(texts, target_indices)
                _, pred = torch.max(outputs.data, 1)
                predicted += pred.tolist()
                truths += labels.tolist()

        correct = (np.array(predicted) == np.array(truths))
        accuracy = correct.sum() / correct.size
        print('accuracy:', accuracy)

        cm = metrics.confusion_matrix(truths, predicted, labels=range(len(polarity_indices)))
        print('confusion matrix:')
        print(cm)

        f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
        print('macro F1:', f1)


sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.load_saved()
sentiment_analyzer.evaluate()
\ No newline at end of file
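
For reference, a minimal usage sketch (not part of the commit) of how the analyzer above could be fine-tuned from scratch instead of loading saved weights; it would replace the three module-level lines at the end of the script and assumes the SemEval-2014 laptop XML files are present under data/.

# Hypothetical alternative entry point: fine-tune TD-BERT, then evaluate.
sentiment_analyzer = BertAnalyzer()
sentiment_analyzer.train()     # fine-tunes on Laptop_Train_v2.xml and writes semeval_2014.pt
sentiment_analyzer.evaluate()  # prints accuracy, confusion matrix and macro F1 on Laptops_Test_Gold.xml
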
import torch
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET
from transformers import BertTokenizer
from tdbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
import re

MAX_SEQ_LEN = 128
polarity_indices = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}

tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)


def generate_batch(batch):
    # encode the pre-tokenized texts, padding to MAX_SEQ_LEN
    texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
                                        max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
                                        return_tensors='pt')

    # index tensor used to gather the BERT outputs at the target token positions;
    # targets shorter than the longest one in the batch repeat their last token index
    max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
    target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES
                                    for t in range(entry['from'], entry['from'] + max_tg_len + 1)]
                                   for entry in batch])

    polarity_labels = torch.tensor([entry['polarity'] for entry in batch])

    return texts, target_indices, polarity_labels


def token_for_char(char_idx, text, tokens):
    # map a character offset in the original text to the index of the WordPiece token containing it
    compressed_idx = len(re.sub(r'\s+', '', text)[:char_idx + 1]) - 1
    token_idx = -1
    while compressed_idx >= 0:
        token_idx += 1
        compressed_idx -= len(tokens[token_idx].replace('##', ''))
    return token_idx


def polarity_index(polarity):
    return polarity_indices[polarity]


class BertDataset(Dataset):

    def __init__(self, xml_file):
        tree = ET.parse(xml_file)
        self.data = []
        for sentence in tree.getroot():
            text = sentence.find('text').text
            aspect_terms = sentence.find('aspectTerms')
            if aspect_terms is not None:
                for term in aspect_terms:
                    char_from = int(term.attrib['from'])
                    char_to = int(term.attrib['to']) - 1  # 'to' marks the end offset, so -1 gives the last character
                    polarity = term.attrib['polarity']
                    self.data.append((text, char_from, char_to, polarity))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, char_from, char_to, polarity_str = self.data[idx]
        tokens = tokenizer.tokenize(text)
        idx_from = token_for_char(char_from, text, tokens)
        idx_to = token_for_char(char_to, text, tokens)
        polarity = polarity_index(polarity_str)
        return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
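
To illustrate how token_for_char lines up SemEval character offsets with WordPiece token indices, here is a small self-contained sketch (not part of the commit); the token list is hand-written rather than produced by the real tokenizer so the snippet runs offline, and the helper simply copies the logic of token_for_char above.

import re


def char_to_token(char_idx, text, tokens):
    # mirrors token_for_char above: consume whitespace-stripped characters token by token
    compressed_idx = len(re.sub(r'\s+', '', text)[:char_idx + 1]) - 1
    token_idx = -1
    while compressed_idx >= 0:
        token_idx += 1
        compressed_idx -= len(tokens[token_idx].replace('##', ''))
    return token_idx


text = 'the touchpad is great'
tokens = ['the', 'touch', '##pad', 'is', 'great']  # hand-written WordPiece-style tokens

print(char_to_token(0, text, tokens))  # 0 -> 'the'
print(char_to_token(4, text, tokens))  # 1 -> 'touch', the first token of the aspect term 'touchpad'
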
import torch
import torch.nn as nn
from transformers import BertModel

HIDDEN_OUTPUT_FEATURES = 768
TRAINED_WEIGHTS = 'bert-base-uncased'


class TDBertNet(nn.Module):

    def __init__(self, num_class):
        super(TDBertNet, self).__init__()
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS)
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels

    def forward(self, texts, target_indices):
        # BERT
        bert_output = self.bert_base(**texts)[0]
        # max pooling at target locations
        target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
        pooled_output = torch.max(target_outputs, dim=1)[0]
        # fc layer
        x = self.fc(pooled_output)
        return x
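
As a quick sanity check of the target pooling in TDBertNet.forward, here is a toy-sized sketch (not part of the commit; hidden size 4 stands in for HIDDEN_OUTPUT_FEATURES, and the target spans are made up): torch.gather with an index tensor shaped (batch, target_length, hidden) selects the hidden vectors at the target token positions, and the max over the target dimension yields one pooled vector per sentence.

import torch

B, L, H = 2, 6, 4  # toy batch size, sequence length and hidden size
bert_output = torch.arange(B * L * H, dtype=torch.float).reshape(B, L, H)

# assumed target spans: tokens 2..3 in the first sentence, token 1 in the second;
# shorter targets repeat their last index, as generate_batch does
spans = [(2, 3), (1, 1)]
max_tg_len = max(to - frm for frm, to in spans)
target_indices = torch.tensor([[[min(t, to)] * H for t in range(frm, frm + max_tg_len + 1)]
                               for frm, to in spans])  # shape (B, max_tg_len + 1, H)

# gather copies the full hidden vector at each target position,
# and max pooling over the span collapses it to one vector per sentence
target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
pooled_output = torch.max(target_outputs, dim=1)[0]

print(target_outputs.shape, pooled_output.shape)  # torch.Size([2, 2, 4]) torch.Size([2, 4])
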
@@ -6,8 +6,8 @@ import pickle
from argument import *
from functools import reduce


class Agent:
    sentiment_threshold = 0.95
    review_tokenizer = ReviewTokenizer()
@@ -22,7 +22,8 @@ class Agent:
        sentences = sent_tokenize(review_body)
        phrases = []
        for sentence in sentences:
            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
                                sentence)
        return phrases

    # analyze sentiment
@@ -65,7 +66,7 @@ class Agent:
            if abs(sentiment) > self.sentiment_threshold:
                for argument in arguments:
                    if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
                        votes[argument] = sentiment  # what if there's two phrases with same argument?
                        vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
        # normalize votes to 1 (+) or -1 (-)
        for argument in votes:
@@ -138,14 +139,15 @@ class Agent:
                    attacker_strengths.append(strengths[child])
                elif child in qbaf['supporters'][argument]:
                    supporter_strengths.append(strengths[child])
            strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
                                                         supporter_strengths)
        return strengths

    def analyze_reviews(self, reviews):
        # get ra
        self.ra = []
        self.vote_sum = {argument: 0 for argument in arguments}
        self.vote_phrases = {argument: [] for argument in arguments}
        voting_reviews = 0
        review_count = 0
        for _, review in reviews.iterrows():
@@ -175,25 +177,27 @@ class Agent:
        print(self.strengths)
        print('votes:')
        for argument in arguments:
            print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
                                                                      len(self.attacking_phrases(argument))))
            print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))

    def get_strongest_supporting_subfeature(self, argument):
        supporters = self.qbaf['supporters'][argument]
        if len(supporters) == 0:
            return None
        supporter_strengths = {s: self.strengths[s] for s in supporters}
        return max(supporter_strengths, key=supporter_strengths.get)

    def get_strongest_attacking_subfeature(self, argument):
        attackers = self.qbaf['attackers'][argument]
        if len(attackers) == 0:
            return None
        attacker_strengths = {a: self.strengths[a] for a in attackers}
        return max(attacker_strengths, key=attacker_strengths.get)

    def liked_argument(self, argument):
        return self.vote_sum[argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))

    def supported_argument(self, argument):
        return (self.get_strongest_supporting_subfeature(argument) != None and
@@ -204,13 +208,13 @@ class Agent:
                self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)

    def best_supporting_phrase(self, argument):
        phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
        if len(phrases) == 0:
            return None
        return max(phrases, key=phrases.get)

    def best_attacking_phrase(self, argument):
        phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)}
        if len(phrases) == 0:
            return None
        return min(phrases, key=phrases.get)
@@ -3,7 +3,6 @@ import math
from nltk.tokenize import TweetTokenizer
import os
from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
-from datetime import datetime
from xml.dom import minidom
import nltk.data
from stanfordcorenlp import StanfordCoreNLP