diff --git a/ADA/.gitignore b/ADA/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..bdf8f7fc838a41037ab60f9e19f8eaa2ce83ba0a
--- /dev/null
+++ b/ADA/.gitignore
@@ -0,0 +1,2 @@
+.idea
+*.pt
\ No newline at end of file
diff --git a/ADA/SA/bert_analyzer.py b/ADA/SA/bert_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb7755ff54469a10f869f83662572109606da4a
--- /dev/null
+++ b/ADA/SA/bert_analyzer.py
@@ -0,0 +1,96 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from tdbertnet import TDBertNet
+from bert_dataset import BertDataset, polarity_indices, generate_batch
+import time
+import numpy as np
+from sklearn import metrics
+
+semeval_2014_train_path = 'data/SemEval-2014/Laptop_Train_v2.xml'
+semeval_2014_test_path = 'data/SemEval-2014/Laptops_Test_Gold.xml'
+trained_model_path = 'semeval_2014.pt'
+
+BATCH_SIZE = 32
+MAX_EPOCHS = 6
+LEARNING_RATE = 0.00002
+loss_criterion = nn.CrossEntropyLoss()
+
+
+def loss(outputs, labels):
+    return loss_criterion(outputs, labels)
+
+class BertAnalyzer:
+
+    def load_saved(self):
+        self.net = TDBertNet(len(polarity_indices))
+        self.net.load_state_dict(torch.load(trained_model_path))
+        self.net.eval()
+
+    def train(self):
+        train_data = BertDataset(semeval_2014_train_path)
+        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
+                                  collate_fn=generate_batch)
+
+        self.net = TDBertNet(len(polarity_indices))
+        optimiser = optim.Adam(self.net.parameters(), lr=LEARNING_RATE)
+
+        start = time.time()
+
+        for epoch in range(MAX_EPOCHS):
+            batch_loss = 0.0
+            for i, (texts, target_indices, labels) in enumerate(train_loader):
+                # zero param gradients
+                optimiser.zero_grad()
+
+                # forward pass
+                outputs = self.net(texts, target_indices)
+
+                # backward pass
+                l = loss(outputs, labels)
+                l.backward()
+
+                # optimise
+                optimiser.step()
+
+                # print interim stats every 10 batches
+                batch_loss += l.item()
+                if i % 10 == 9:
+                    print('epoch:', epoch + 1, '-- batch:', i + 1, '-- avg loss:', batch_loss / 10)
+                    batch_loss = 0.0
+
+        end = time.time()
+        print('Training took', end - start, 'seconds')
+
+        torch.save(self.net.state_dict(), trained_model_path)
+
+    def evaluate(self):
+        test_data = BertDataset(semeval_2014_test_path)
+        test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
+                                 collate_fn=generate_batch)
+
+        predicted = []
+        truths = []
+        with torch.no_grad():
+            for (texts, target_indices, labels) in test_loader:
+                outputs = self.net(texts, target_indices)
+                _, pred = torch.max(outputs.data, 1)
+                predicted += pred.tolist()
+                truths += labels.tolist()
+
+        correct = (np.array(predicted) == np.array(truths))
+        accuracy = correct.sum() / correct.size
+        print('accuracy:', accuracy)
+
+        cm = metrics.confusion_matrix(truths, predicted, labels=range(len(polarity_indices)))
+        print('confusion matrix:')
+        print(cm)
+
+        f1 = metrics.f1_score(truths, predicted, labels=range(len(polarity_indices)), average='macro')
+        print('macro F1:', f1)
+
+
+sentiment_analyzer = BertAnalyzer()
+sentiment_analyzer.load_saved()
+sentiment_analyzer.evaluate()
\ No newline at end of file
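For context, the committed module both defines `BertAnalyzer` and, at the bottom, constructs one and runs `evaluate()` at import time (which requires `semeval_2014.pt` to exist). A minimal sketch of how the class is meant to be driven; the standalone driver script and the `__main__` guard are illustrative additions, not part of the commit:

```python
# Hypothetical driver script for ADA/SA/bert_analyzer.py (not in the diff).
from bert_analyzer import BertAnalyzer

if __name__ == '__main__':
    analyzer = BertAnalyzer()
    # Fine-tune TD-BERT on the SemEval-2014 laptop training set and write
    # the weights to semeval_2014.pt ...
    analyzer.train()
    # ... or reload previously saved weights, then print accuracy, the
    # confusion matrix and macro F1 on the gold test set.
    analyzer.load_saved()
    analyzer.evaluate()
```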
diff --git a/ADA/SA/bert_dataset.py b/ADA/SA/bert_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6bde144a68503b717254820423aa18436f5620e
--- /dev/null
+++ b/ADA/SA/bert_dataset.py
@@ -0,0 +1,71 @@
+import torch
+from torch.utils.data import Dataset
+import xml.etree.ElementTree as ET
+from transformers import *
+from tdbertnet import TRAINED_WEIGHTS, HIDDEN_OUTPUT_FEATURES
+import re
+
+MAX_SEQ_LEN = 128
+polarity_indices = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
+tokenizer = BertTokenizer.from_pretrained(TRAINED_WEIGHTS)
+
+
+def generate_batch(batch):
+    texts = tokenizer.batch_encode_plus([entry['tokens'] for entry in batch], add_special_tokens=True,
+                                        max_length=MAX_SEQ_LEN, pad_to_max_length=True, is_pretokenized=True,
+                                        return_tensors='pt')
+
+    max_tg_len = max(entry['to'] - entry['from'] for entry in batch)
+    target_indices = torch.tensor([[[min(t, entry['to'])] * HIDDEN_OUTPUT_FEATURES
+                                    for t in range(entry['from'], entry['from'] + max_tg_len + 1)]
+                                   for entry in batch])
+
+    polarity_labels = torch.tensor([entry['polarity'] for entry in batch])
+
+    return texts, target_indices, polarity_labels
+
+
+def token_for_char(char_idx, text, tokens):
+    compressed_idx = len(re.sub(r'\s+', '', text)[:char_idx+1]) - 1
+
+    token_idx = -1
+    while compressed_idx >= 0:
+        token_idx += 1
+        compressed_idx -= len(tokens[token_idx].replace('##', ''))
+
+    return token_idx
+
+
+def polarity_index(polarity):
+    return polarity_indices[polarity]
+
+
+class BertDataset(Dataset):
+
+    def __init__(self, xml_file):
+        tree = ET.parse(xml_file)
+
+        self.data = []
+
+        for sentence in tree.getroot():
+            text = sentence.find('text').text
+            aspect_terms = sentence.find('aspectTerms')
+            if aspect_terms:
+                for term in aspect_terms:
+                    char_from = int(term.attrib['from'])
+                    char_to = int(term.attrib['to']) - 1
+                    polarity = term.attrib['polarity']
+                    self.data.append((text, char_from, char_to, polarity))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        text, char_from, char_to, polarity_str = self.data[idx]
+
+        tokens = tokenizer.tokenize(text)
+        idx_from = token_for_char(char_from, text, tokens)
+        idx_to = token_for_char(char_to, text, tokens)
+        polarity = polarity_index(polarity_str)
+
+        return {'tokens': tokens, 'from': idx_from, 'to': idx_to, 'polarity': polarity}
diff --git a/ADA/SA/tdbertnet.py b/ADA/SA/tdbertnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..24a69f12868ade1d0ac8661ea11320ff1bdf4465
--- /dev/null
+++ b/ADA/SA/tdbertnet.py
@@ -0,0 +1,24 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import *
+
+HIDDEN_OUTPUT_FEATURES = 768
+TRAINED_WEIGHTS = 'bert-base-uncased'
+
+class TDBertNet(nn.Module):
+
+    def __init__(self, num_class):
+        super(TDBertNet, self).__init__()
+        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS)
+        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, num_class)  # n of hidden features, n of output labels
+
+    def forward(self, texts, target_indices):
+        # BERT
+        bert_output = self.bert_base(**texts)[0]
+        # max pooling at target locations
+        target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
+        pooled_output = torch.max(target_outputs, dim=1)[0]
+        # fc layer
+        x = self.fc(pooled_output)
+        return x
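The least obvious part of the two new modules is how `generate_batch` builds `target_indices` so that `TDBertNet.forward` can `torch.gather` the BERT hidden states of just the aspect-term tokens and max-pool over that span. A toy illustration of the mechanics, with a hidden size of 4 instead of `HIDDEN_OUTPUT_FEATURES = 768` and made-up values:

```python
import torch

# Stand-in for a BERT output: batch of 1 sentence, 6 tokens, hidden size 4.
bert_output = torch.arange(24, dtype=torch.float).reshape(1, 6, 4)

# For an aspect term covering token indices 2..3, generate_batch produces an
# index tensor of shape (batch, span_length, hidden_size) whose entries are
# the token index repeated across the hidden dimension (clamped at 'to' when
# a shorter span is padded out to the longest span in the batch).
target_indices = torch.tensor([[[2] * 4, [3] * 4]])

# gather selects the hidden vectors of tokens 2 and 3 ...
target_outputs = torch.gather(bert_output, dim=1, index=target_indices)
# ... and max-pooling over the span yields one vector per example, which the
# final linear layer maps to the polarity logits.
pooled_output = torch.max(target_outputs, dim=1)[0]
print(target_outputs.shape)  # torch.Size([1, 2, 4])
print(pooled_output.shape)   # torch.Size([1, 4])
```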
diff --git a/ADA/agent.py b/ADA/agent.py
index 4ef3a53e4b633ff4b171b5c8cda5b6168e32ee8c..e908879510e50c82f0972d00897c179c72e91d33 100644
--- a/ADA/agent.py
+++ b/ADA/agent.py
@@ -6,8 +6,8 @@ import pickle
 from argument import *
 from functools import reduce
 
 
-class Agent: 
+class Agent:
 
     sentiment_threshold = 0.95
     review_tokenizer = ReviewTokenizer()
@@ -22,7 +22,8 @@ class Agent:
         sentences = sent_tokenize(review_body)
         phrases = []
         for sentence in sentences:
-            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />', sentence)
+            phrases += re.split(' but | although | though | otherwise | however | unless | whereas | despite |<br />',
+                                sentence)
         return phrases
 
     # analyze sentiment
@@ -65,7 +66,7 @@ class Agent:
             if abs(sentiment) > self.sentiment_threshold:
                 for argument in arguments:
                     if (argument not in votes) or (abs(votes[argument]) < abs(sentiment)):
-                        votes[argument] = sentiment # what if there's two phrases with same argument?
+                        votes[argument] = sentiment  # what if there's two phrases with same argument?
                         vote_phrases[argument] = {'phrase': phrase, 'sentiment': sentiment}
         # normalize votes to 1 (+) or -1 (-)
         for argument in votes:
@@ -138,14 +139,15 @@ class Agent:
                     attacker_strengths.append(strengths[child])
                 elif child in qbaf['supporters'][argument]:
                     supporter_strengths.append(strengths[child])
-            strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths, supporter_strengths)
+            strengths[argument] = self.argument_strength(qbaf['base_scores'][argument], attacker_strengths,
+                                                         supporter_strengths)
         return strengths
 
     def analyze_reviews(self, reviews):
         # get ra
         self.ra = []
-        self.vote_sum = {argument : 0 for argument in arguments}
-        self.vote_phrases = {argument : [] for argument in arguments}
+        self.vote_sum = {argument: 0 for argument in arguments}
+        self.vote_phrases = {argument: [] for argument in arguments}
         voting_reviews = 0
         review_count = 0
         for _, review in reviews.iterrows():
@@ -175,25 +177,27 @@ class Agent:
         print(self.strengths)
         print('votes:')
         for argument in arguments:
-            print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)), len(self.attacking_phrases(argument))))
+            print(argument, 'direct: {} positive, {} negative'.format(len(self.supporting_phrases(argument)),
+                                                                      len(self.attacking_phrases(argument))))
             print(argument, 'augmented sum: {}'.format(self.vote_sum[argument]))
 
     def get_strongest_supporting_subfeature(self, argument):
         supporters = self.qbaf['supporters'][argument]
         if len(supporters) == 0:
             return None
-        supporter_strengths = {s : self.strengths[s] for s in supporters}
+        supporter_strengths = {s: self.strengths[s] for s in supporters}
         return max(supporter_strengths, key=supporter_strengths.get)
 
     def get_strongest_attacking_subfeature(self, argument):
         attackers = self.qbaf['attackers'][argument]
         if len(attackers) == 0:
             return None
-        attacker_strengths = {a : self.strengths[a] for a in attackers}
+        attacker_strengths = {a: self.strengths[a] for a in attackers}
         return max(attacker_strengths, key=attacker_strengths.get)
 
     def liked_argument(self, argument):
-        return self.vote_sum[argument] >= 0 # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
+        return self.vote_sum[
+                   argument] >= 0  # len(self.supporting_phrases(argument)) >= len(self.attacking_phrases(argument))
 
     def supported_argument(self, argument):
         return (self.get_strongest_supporting_subfeature(argument) != None and
@@ -204,13 +208,13 @@ class Agent:
                 self.strengths[self.get_strongest_attacking_subfeature(argument)] > 0)
 
     def best_supporting_phrase(self, argument):
-        phrases = {vp['phrase'] : vp['sentiment'] for vp in self.supporting_phrases(argument)}
+        phrases = {vp['phrase']: vp['sentiment'] for vp in self.supporting_phrases(argument)}
         if len(phrases) == 0:
             return None
         return max(phrases, key=phrases.get)
 
     def best_attacking_phrase(self, argument):
-        phrases = {vp['phrase'] : vp['sentiment'] for vp in self.attacking_phrases(argument)}
+        phrases = {vp['phrase']: vp['sentiment'] for vp in self.attacking_phrases(argument)}
         if len(phrases) == 0:
             return None
         return min(phrases, key=phrases.get)
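The rewrapped `re.split` call in the second hunk above is what slices each review sentence into the phrases that are later scored and counted as votes; a quick illustration of its behaviour on an invented sentence:

```python
import re

# Same conjunction/marker pattern that agent.py applies to every sentence.
pattern = ' but | although | though | otherwise | however | unless | whereas | despite |<br />'
sentence = 'The screen is gorgeous but the battery life is disappointing'
print(re.split(pattern, sentence))
# ['The screen is gorgeous', 'the battery life is disappointing']
```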
diff --git a/ADA/review_annotation.py b/ADA/review_annotation.py
index 2b7455bdf0d777c87131bf67cb54ba931ac2461a..8c9d68549a394f2fb3a6fe2084f52d44b8df6de6 100644
--- a/ADA/review_annotation.py
+++ b/ADA/review_annotation.py
@@ -3,7 +3,6 @@ import math
 from nltk.tokenize import TweetTokenizer
 import os
 from xml.etree.ElementTree import ElementTree, parse, tostring, Element, SubElement
-from datetime import datetime
 from xml.dom import minidom
 import nltk.data
 from stanfordcorenlp import StanfordCoreNLP