Commit 7d305f80 authored by Se Park

Latest version; the dataloader still needs to be fixed

parent 65fcce63
# CO490 - NLP Course Labs (Spring 2020)
## Lab Notebooks
- **(16/01/2020) Lab 1:** Pre-processing and word representations [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/lab01/preprocessing_and_embeddings.ipynb)
- **(23/01/2020) Lab 2:** Text Classification: Sentiment Analysis [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/lab02/sentiment_classification.ipynb)
- **(30/01/2020) Lab 3:** Language Modelling
  - Part I: N-gram modelling [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/lab03/ngram_lm.ipynb)
  - Part II: Neural language models [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/lab03/neural_lm.ipynb)
- **(06/02/2020) Lab 4:** Part of Speech Tagging [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/lab04/POStagging.ipynb)
## Coursework
05/02/2020: A baseline model for the coursework has been [added](/coursework/baseline.ipynb) [(Open in Colab)](https://colab.research.google.com/github/ImperialNLP/NLPLabs/blob/master/coursework/baseline.ipynb)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, AlbertTokenizer
class Data:
    """A single training/test example for the dataset.

    Attributes:
        src: Source-language sentence.
        mt: Machine-translated sentence paired with ``src``.
        score: Quality score attached to the (src, mt) pair.
    """

    def __init__(self, src, mt, score):
        self.src = src
        self.mt = mt
        self.score = score

    def __str__(self):
        # The printable form is deliberately the same as the repr.
        return repr(self)

    def __repr__(self):
        # The score is shown under the key "label" in the rendered text.
        parts = [
            "src: {}".format(self.src),
            "mt: {}".format(self.mt),
            "label: {}".format(self.score),
        ]
        return ", ".join(parts)
class LoadData(Dataset):
    """Sentence-pair dataset for quality estimation with a BERT encoder.

    Reads three line-aligned text files (source sentences, machine
    translations and scores) and serves each pair encoded as
    ``[CLS] src [SEP] mt [SEP]``, padded or truncated to ``maxlen`` tokens.
    """

    def __init__(self, src_file, mt_file, score_file, maxlen):
        """Load the parallel files and set up the tokenizer.

        Args:
            src_file: Path to the source sentences, one per line.
            mt_file: Path to the machine translations, one per line.
            score_file: Path to the scores, one float per line.
            maxlen: Fixed length (in tokens) of every encoded example.

        Raises:
            ValueError: If ``maxlen`` cannot hold the three special tokens,
                or if the three files do not have the same number of lines.
        """
        if maxlen < 3:
            raise ValueError(
                "maxlen must be at least 3 to hold [CLS] and two [SEP] tokens, "
                "got {}".format(maxlen))

        src_sentences = self._read_lines(src_file)
        mt_sentences = self._read_lines(mt_file)
        scores = self._read_lines(score_file)

        # zip() would silently stop at the shortest file and hide misaligned
        # data, so fail loudly instead.
        if not (len(src_sentences) == len(mt_sentences) == len(scores)):
            raise ValueError(
                "Line count mismatch: {} source, {} mt, {} score lines".format(
                    len(src_sentences), len(mt_sentences), len(scores)))

        self.data = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
                     for s, m, h in zip(src_sentences, mt_sentences, scores)]
        # The checkpoint is *cased*: lower-casing the input would discard
        # information its vocabulary relies on, so keep the original casing.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
        self.maxlen = maxlen

    @staticmethod
    def _read_lines(path):
        """Return all lines of the UTF-8 text file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return f.readlines()

    @staticmethod
    def _truncate_pair(first, second, budget):
        """Trim the longer token list from the end until both fit ``budget``."""
        first, second = list(first), list(second)
        budget = max(budget, 0)
        while len(first) + len(second) > budget:
            if len(first) > len(second):
                first.pop()
            else:
                second.pop()
        return first, second

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Encode the example at ``index``.

        Returns:
            A tuple ``(token_ids, segment_ids, attn_mask, score)``. The first
            three are ``torch.long`` tensors of length ``maxlen``; ``score``
            is a scalar ``torch.float`` tensor.
        """
        example = self.data[index]

        # Tokenize both sentences
        src_tokens = self.tokenizer.tokenize(example.src)
        mt_tokens = self.tokenizer.tokenize(example.mt)

        # Reserve room for [CLS] and the two [SEP] tokens, trimming the longer
        # side first so an over-long source cannot push the translation out.
        src_tokens, mt_tokens = self._truncate_pair(src_tokens, mt_tokens, self.maxlen - 3)

        # Inserting the CLS and SEP tokens around the two segments
        tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
        segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)

        # Padding up to the fixed length; the mask is built from the known
        # lengths rather than inferred from the padding token's id.
        num_real = len(tokens)
        num_pad = self.maxlen - num_real
        tokens = tokens + ["[PAD]"] * num_pad
        segment_ids = segment_ids + [0] * num_pad
        attn_mask = [1] * num_real + [0] * num_pad

        # Obtaining the indices of the tokens in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Converting the lists to pytorch tensors
        tokens_ids_tensor = torch.tensor(tokens_ids, dtype=torch.long)
        segment_ids_tensor = torch.tensor(segment_ids, dtype=torch.long)
        attn_mask_tensor = torch.tensor(attn_mask, dtype=torch.long)
        # A plain Python float would be collated into a float64 batch, which
        # does not match float32 model outputs in the loss.
        score_tensor = torch.tensor(example.score, dtype=torch.float)
        return tokens_ids_tensor, segment_ids_tensor, attn_mask_tensor, score_tensor
......@@ -4,7 +4,7 @@ import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from model import QualityEstimation
from dataloader import LoadData
from dataloader import Data, LoadData
from pathlib import Path
def set_seed(seed=123):
......@@ -18,9 +18,9 @@ def evaluate(model, loss_fn, dataloader, device):
pred, ref = np.array([]), np.array([])
count = 0
with torch.no_grad():
for seq, attn_masks, labels in dataloader:
seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
qe_scores = model(seq, attn_masks)
for token_ids, segment_ids, attn_masks, labels in dataloader:
token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
qe_scores = model(token_ids, segment_ids, attn_masks)
loss = loss_fn(qe_scores, labels)
qe_scores = qe_scores.detach().cpu().numpy()
......@@ -35,21 +35,21 @@ def evaluate(model, loss_fn, dataloader, device):
eval_loss = eval_loss / count
pearson = np.corrcoef(pred, ref)[0, 1]
return eval_loss, pearson
def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device):
best_acc = 0
best_pearson = -float('inf')
for ep in range(num_epoch):
print('======= Epoch {:} ======='.format(ep))
for it, (seq, attn_masks, labels) in enumerate(train_loader):
for it, (token_ids, segment_ids, attn_masks, labels) in enumerate(train_loader):
# Clear gradients
optimizer.zero_grad()
# Converting these to cuda tensors
seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
# Obtaining scores from the model
qe_scores = model(seq, attn_masks)
qe_scores = model(token_ids, segment_ids, attn_masks)
# Computing loss
loss = loss_fn(qe_scores, labels)
# Backpropagating the gradients
......@@ -59,15 +59,14 @@ def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device
optimizer.step()
if it % 100 == 0 and not it == 0:
acc = get_accuracy_from_logits(logits, labels)
print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it, ep, loss.item(), acc))
print("Iteration {} of epoch {} complete".format(it, ep))
val_acc = evaluate(model, loss_fn, val_loader, device)
print("Epoch {} complete! Validation Accuracy : {}".format(ep, val_acc))
if val_acc > best_acc:
print("Best validation accuracy improved from {} to {}, saving model...".format(best_acc, val_acc))
best_acc = val_acc
torch.save(model.state_dict(), '/vol/bitbucket/shp2918/modelNLP.pt')
rmse, pearson = evaluate(model, loss_fn, val_loader, device)
print("Epoch {} complete! RMSE: {}, Pearson: {}".format(ep, rmse, pearson))
if pearson > best_pearson:
print("Best Pearson improved from {} to {}, saving model...".format(best_pearson, pearson))
best_pearson = pearson
torch.save(model.state_dict(), '/vol/bitbucket/shp2918/nlp/modelNLP.pt')
if __name__ == "__main__":
......@@ -84,10 +83,9 @@ if __name__ == "__main__":
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr = 2e-5)
MAX_LEN = 64
st = time.time()
train_set = LoadData(filename=PATH/'data/train.csv', maxlen=MAX_LEN)
val_set = LoadData(filename=PATH/'data/valid.csv', maxlen=MAX_LEN)
MAX_LEN = 128
train_set = LoadData(src_file=PATH/'data/train.ende.src', mt_file=PATH/'data/train.ende.mt', score_file=PATH/'data/train.ende.scores', maxlen=MAX_LEN)
val_set = LoadData(src_file=PATH/'data/dev.ende.src', mt_file=PATH/'data/dev.ende.mt', score_file=PATH/'data/dev.ende.scores', maxlen=MAX_LEN)
train_loader = DataLoader(train_set, batch_size=32, num_workers=5)
val_loader = DataLoader(val_set, batch_size=32, num_workers=5)
......
......@@ -5,7 +5,7 @@ from transformers import BertModel, BertConfig
class QualityEstimation(nn.Module):
def __init__(self, hidden_dim):
super(BertForQualityEstimation, self).__init__()
super(QualityEstimation, self).__init__()
self.hidden_dim = hidden_dim
# Instantiating BERT model object
......@@ -21,14 +21,14 @@ class QualityEstimation(nn.Module):
self.fc2 = nn.Linear(self.hidden_dim, 1)
self.loss = nn.MSELoss()
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
def forward(self, token_ids, segment_ids=None, attention_mask=None):
# Feeding the input to BERT model to obtain contextualized representations
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
flat_token_ids = token_ids.view(-1, token_ids.size(-1))
flat_segment_ids = segment_ids.view(-1, segment_ids.size(-1))
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
encoded_layers, _ = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
encoded_layers, _ = self.bert(flat_token_ids, flat_segment_ids, flat_attention_mask)
encoded_layers = self.dropout(encoded_layers)
output, _ = self.lstm(encoded_layers)
output = torch.tanh(self.fc1(output[:,-1,:]))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment