Commit a5eed89b authored by Se Park's avatar Se Park

Latest BERT model

parent 04f49c4f
%% Cell type:code id: tags:
```
! pip install transformers
```
%% Output
Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (2.5.1)
Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.38)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.28.1)
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.11.15)
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.85)
Requirement already satisfied: tokenizers==0.5.2 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.5.2)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.17.5)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.0)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2019.11.28)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)
Requirement already satisfied: botocore<1.15.0,>=1.14.15 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.14.15)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.4)
Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (0.15.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (2.6.1)
%% Cell type:code id: tags:
```
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
from pathlib import Path
import os
```
%% Cell type:code id: tags:
```
from os.path import exists
if not exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip
```
%% Cell type:code id: tags:
```
class Data(object):
    """A single training/test example for the dataset."""
    def __init__(self, src, mt, score=None):
        self.src = src
        self.mt = mt
        self.score = score

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        l = ["src: {}".format(self.src), "mt: {}".format(self.mt)]
        if self.score is not None:
            l.append("label: {}".format(self.score))
        return ", ".join(l)


class LoadData(Dataset):
    def __init__(self, maxlen, src_file, mt_file, score_file=None):
        self.score_file = score_file
        with open(src_file, 'r', encoding='utf-8') as f:
            src_sentences = f.readlines()
        with open(mt_file, 'r', encoding='utf-8') as f:
            mt_sentences = f.readlines()
        if self.score_file is not None:
            with open(score_file, 'r', encoding='utf-8') as f:
                scores = f.readlines()
            self.data = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
                         for s, m, h in zip(src_sentences, mt_sentences, scores)]
        else:
            self.data = [Data(src=s.strip(), mt=m.strip())
                         for s, m in zip(src_sentences, mt_sentences)]
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Select the source, machine translation, and score at the specified index
        src = self.data[index].src
        mt = self.data[index].mt
        score = self.data[index].score
        # Preprocess the text for BERT: tokenize both sentences
        src_tokens = self.tokenizer.tokenize(src)
        mt_tokens = self.tokenizer.tokenize(mt)
        # Insert the CLS and SEP tokens around the source and the translation
        tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
        # Obtain the indices of the tokens in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        segment_ids = [0]*(len(src_tokens)+2) + [1]*(len(mt_tokens)+1)
        attn_mask = [1]*len(tokens_ids)
        if len(tokens) < self.maxlen:
            # Pad the sequences to the specified max length
            padding = [0] * (self.maxlen-len(tokens_ids))
            tokens_ids += padding
            segment_ids += padding
            attn_mask += padding
        else:
            # Prune the sequences to the specified max length
            tokens_ids = tokens_ids[:self.maxlen]
            segment_ids = segment_ids[:self.maxlen]
            attn_mask = attn_mask[:self.maxlen]
        # Convert the lists to PyTorch tensors
        tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)
        if self.score_file is not None:
            # Squash the score into (-1, 1) with tanh; predictions are mapped back with arctanh at test time
            score = torch.tensor(score, dtype=torch.float)
            score = torch.tanh(score)
            return tokens_ids, attn_mask, segment_ids, score
        return tokens_ids, attn_mask, segment_ids
```
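%% Cell type:markdown id: tags:
A minimal sketch (with made-up example sentences, not taken from the dataset) of what `LoadData.__getitem__` builds for one source/translation pair: both sentences are tokenized, wrapped as `[CLS] src [SEP] mt [SEP]`, given segment id 0 for the source and 1 for the translation, and padded or truncated to `maxlen`.
%% Cell type:code id: tags:
```
# Sketch only: hypothetical sentences, just to show the pair encoding.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
src = "The cat sat on the mat."           # hypothetical English source
mt = "Die Katze sass auf der Matte."      # hypothetical German translation
src_tokens = tokenizer.tokenize(src)
mt_tokens = tokenizer.tokenize(mt)
tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)
attn_mask = [1] * len(token_ids)
print(tokens)
print(len(token_ids), len(segment_ids), len(attn_mask))
```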
%% Cell type:code id: tags:
```
class QualityEstimation(nn.Module):
    def __init__(self, hidden_dim):
        super(QualityEstimation, self).__init__()
        self.hidden_dim = hidden_dim
        # Instantiating the pretrained multilingual BERT model
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.25)
        # LSTM and regression layers
        self.lstm = nn.LSTM(input_size=768, hidden_size=self.hidden_dim,
                            num_layers=1, batch_first=True,
                            dropout=0, bidirectional=False)
        self.fc1 = nn.Linear(self.hidden_dim, 1)
        nn.init.kaiming_normal_(self.fc1.weight)

    def forward(self, token_ids, segment_ids=None, attention_mask=None):
        # BERT returns the sequence of hidden states and the pooled [CLS] output
        encoded_layers, _ = self.bert(input_ids=token_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
        encoded_layers = self.dropout(encoded_layers)
        # Run the encoded sequence through the LSTM and regress on its last time step
        output, _ = self.lstm(encoded_layers)
        qe_scores = self.fc1(output[:, -1, :])
        qe_scores = torch.tanh(qe_scores)
        return qe_scores
```
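%% Cell type:markdown id: tags:
A quick shape check, as a sketch with random token ids and an untrained regression head: the model should map a `(batch, seq_len)` input to one quality score per sentence pair.
%% Cell type:code id: tags:
```
# Sketch only: random ids, just to confirm the output shape is (batch, 1).
sanity_model = QualityEstimation(hidden_dim=128)
dummy_ids = torch.randint(0, 1000, (2, 16))   # 2 pairs, 16 tokens each
dummy_segments = torch.zeros_like(dummy_ids)  # all tokens in segment 0
dummy_mask = torch.ones_like(dummy_ids)       # attend to every position
with torch.no_grad():
    dummy_scores = sanity_model(dummy_ids, dummy_segments, dummy_mask)
print(dummy_scores.shape)  # expected: torch.Size([2, 1])
```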
%% Cell type:code id: tags:
```
def set_seed(seed=123):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)


def evaluate(model, loss_fn, dataloader, device):
    model.eval()
    eval_loss = 0
    pred, ref = np.array([]), np.array([])
    count = 0
    with torch.no_grad():
        # Unpack batches in the order the Dataset returns them: ids, attention mask, segment ids, score
        for token_ids, attn_masks, segment_ids, labels in dataloader:
            token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
            qe_scores = model(token_ids, segment_ids, attn_masks)
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))
            qe_scores = qe_scores.detach().cpu().numpy()
            qe_scores = qe_scores.reshape((qe_scores.shape[0],))
            labels = labels.to('cpu').numpy()
            pred = np.concatenate((pred, qe_scores))
            ref = np.concatenate((ref, labels))
            eval_loss += loss.item()
            count += 1
    # Average validation loss (mean MSELoss over batches) and Pearson correlation
    eval_loss = eval_loss / count
    pearson = np.corrcoef(pred, ref)[0, 1]
    return eval_loss, pearson


def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler=None):
    best_pearson = -float('inf')
    for ep in range(num_epoch):
        print('======= Epoch {:} ======='.format(ep))
        for it, (token_ids, attn_masks, segment_ids, labels) in enumerate(train_loader):
            model.train()
            # Clear gradients
            optimizer.zero_grad()
            # Move the batch to the selected device
            token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
            # Obtain scores from the model
            qe_scores = model(token_ids, segment_ids, attn_masks)
            # Compute the regression loss
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))
            # Backpropagate and clip the gradients
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Optimization step
            optimizer.step()
            if it % 100 == 0 and not it == 0:
                print("Iteration {} of epoch {} complete".format(it, ep))
        # Evaluate on the validation set once per epoch
        rmse, pearson = evaluate(model, loss_fn, val_loader, device)
        print("Epoch {} complete! RMSE: {}, Pearson: {}".format(ep, rmse, pearson))
        if pearson > best_pearson:
            print("Best Pearson improved from {} to {}, saving model...".format(best_pearson, pearson))
            best_pearson = pearson
            torch.save(model.state_dict(), 'modelNLP.pt')
        if scheduler is not None:
            scheduler.step()
```
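%% Cell type:markdown id: tags:
For reference, the two validation metrics above can be reproduced on toy arrays: the epoch-level loss is the mean `MSELoss` over validation batches, and Pearson's r comes from `np.corrcoef` on the concatenated predictions and references. The values below are hypothetical.
%% Cell type:code id: tags:
```
# Sketch with hypothetical predictions/references, mirroring evaluate() above.
toy_pred = np.array([0.10, -0.20, 0.30, 0.05])
toy_ref = np.array([0.00, -0.10, 0.40, 0.10])
toy_mse = np.mean((toy_pred - toy_ref) ** 2)        # mean squared error
toy_pearson = np.corrcoef(toy_pred, toy_ref)[0, 1]  # Pearson correlation
print(toy_mse, toy_pearson)
```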
%% Cell type:code id: tags:
```
PATH = Path("./")
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print("Using GPU: {}".format(use_cuda))
set_seed()
model = QualityEstimation(hidden_dim=128)
model.to(device)
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.00002)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)
MAX_LEN = 80
train_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'train.ende.src', mt_file=PATH/'train.ende.mt', score_file=PATH/'train.ende.scores')
val_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'dev.ende.src', mt_file=PATH/'dev.ende.mt', score_file=PATH/'dev.ende.scores')
train_loader = DataLoader(train_set, batch_size=32)
val_loader = DataLoader(val_set, batch_size=32)
num_epoch = 4
train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler)
```
%% Output
Using GPU: True
======= Epoch 0 =======
Iteration 100 of epoch 0 complete
Iteration 200 of epoch 0 complete
Epoch 0 complete! RMSE: 0.20935339806601405, Pearson: 0.14687585604000064
Best Pearson improved from -inf to 0.14687585604000064, saving model...
======= Epoch 1 =======
Iteration 100 of epoch 1 complete
Iteration 200 of epoch 1 complete
Epoch 1 complete! RMSE: 0.20592319779098034, Pearson: 0.209848575646105
Best Pearson improved from 0.14687585604000064 to 0.209848575646105, saving model...
======= Epoch 2 =======
Iteration 100 of epoch 2 complete
Iteration 200 of epoch 2 complete
Epoch 2 complete! RMSE: 0.21317375591024756, Pearson: 0.17407713168949893
======= Epoch 3 =======
Iteration 100 of epoch 3 complete
Iteration 200 of epoch 3 complete
Epoch 3 complete! RMSE: 0.2287141541019082, Pearson: 0.15072615365604347
%% Cell type:code id: tags:
```
def writeScores(method_name, scores):
    fn = "predictions.txt"
    with open(fn, 'w') as output_file:
        for x in scores:
            output_file.write(f"{x}\n")
```
%% Cell type:code id: tags:
```
test_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'test.ende.src', mt_file=PATH/'test.ende.mt')
test_loader = DataLoader(test_set, batch_size=len(test_set))

model = QualityEstimation(hidden_dim=128)
model.load_state_dict(torch.load('modelNLP.pt'))
model.to(device)

print('Start testing...')
model.eval()
with torch.no_grad():
    # Without a score file the Dataset yields (token ids, attention mask, segment ids)
    for token_ids, attn_masks, segment_ids in test_loader:
        # Move the tensors to the selected device
        token_ids, segment_ids, attn_masks = token_ids.to(device), segment_ids.to(device), attn_masks.to(device)
        # Obtain scores from the model
        qe_scores = model(token_ids, segment_ids, attn_masks)
print('Testing finished!')
# Map the tanh-squashed predictions back to the original score scale
qe_scores = [np.arctanh(scores[0]) for scores in qe_scores.detach().cpu().numpy()]
```
%% Output
Start testing...
Testing finished!
%% Cell type:code id: tags:
```
from google.colab import files
from zipfile import ZipFile
writeScores("BERT",qe_scores)
with ZipFile("en-de_svr.zip", "w") as newzip:
    newzip.write("predictions.txt")
files.download('en-de_svr.zip')
```
%% Output