%% Cell type:code id: tags:
# Install the Hugging Face Transformers package (the captured output below shows 2.5.1 already present).
! pip install transformers
%% Output
Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (2.5.1)
Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.38)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.28.1)
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.11.15)
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.85)
Requirement already satisfied: tokenizers==0.5.2 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.5.2)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.17.5)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.0)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2019.11.28)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)
Requirement already satisfied: botocore<1.15.0,>=1.14.15 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.14.15)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.4)
Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (0.15.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (2.6.1)
%% Cell type:code id: tags:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
%% Cell type:code id: tags:
from os.path import exists
# Fetch the dataset archive only when it is not already present locally.
# NOTE(review): the file name passed to exists() is empty in this copy, and the
# wget command below has lost both its output file name and its URL — restore
# them before running this cell.
if not exists(''):
!wget -O
%% Cell type:code id: tags:
class Data(object):
    """A single training/test example for the dataset.

    Attributes:
        src: Text taken from the source file.
        mt: Text taken from the translation (mt) file.
        score: Numeric label for the pair, or ``None`` for unlabeled examples.
    """

    def __init__(self, src, mt, score=None):
        self.src = src
        self.mt = mt
        self.score = score

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """Return ``src: ..., mt: ...`` plus ``label: ...`` when a score is set."""
        parts = ["src: {}".format(self.src), "mt: {}".format(self.mt)]
        # Compare against None explicitly so a legitimate score of 0.0 is shown.
        if self.score is not None:
            parts.append("label: {}".format(self.score))
        return ", ".join(parts)
class LoadData(Dataset):
    """Sentence-pair dataset encoded for BERT.

    Each item is the pair packed as ``[CLS] src [SEP] mt [SEP]``, converted to
    vocabulary ids and padded (with 0) or truncated to exactly ``maxlen``
    positions.

    Args:
        maxlen: Fixed length of every encoded sequence.
        src_file: Path of the file holding one source sentence per line.
        mt_file: Path of the file holding one translation per line.
        score_file: Optional path of the file holding one float score per
            line. When omitted the dataset is unlabeled.
    """

    def __init__(self, maxlen, src_file, mt_file, score_file=None):
        self.score_file = score_file
        self.maxlen = maxlen

        with open(src_file, 'r', encoding='utf-8') as f:
            src_sentences = f.readlines()
        with open(mt_file, 'r', encoding='utf-8') as f:
            mt_sentences = f.readlines()

        if self.score_file is not None:
            with open(score_file, 'r', encoding='utf-8') as f:
                scores = f.readlines()
            self.data = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
                         for s, m, h in zip(src_sentences, mt_sentences, scores)]
        else:
            self.data = [Data(src=s.strip(), mt=m.strip())
                         for s, m in zip(src_sentences, mt_sentences)]

        # The checkpoint is *cased*: lower-casing the input would produce
        # tokens that do not match the cased vocabulary.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                                       do_lower_case=False)

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _truncate_pair(src_tokens, mt_tokens, budget):
        """Trim the longer of the two token lists until both fit in ``budget``."""
        src_tokens, mt_tokens = list(src_tokens), list(mt_tokens)
        while len(src_tokens) + len(mt_tokens) > budget:
            if len(src_tokens) > len(mt_tokens):
                src_tokens.pop()
            else:
                mt_tokens.pop()
        return src_tokens, mt_tokens

    def __getitem__(self, index):
        """Encode the example at ``index``.

        Returns:
            ``(token_ids, segment_ids, attn_mask)`` as ``torch.long`` tensors
            of length ``maxlen``, followed by ``tanh(score)`` as a float
            tensor when the dataset was built with a score file. The order
            matches how the training, evaluation and inference loops unpack
            a batch and pass it on to the model.
        """
        example = self.data[index]

        # Tokenize both sides of the pair.
        src_tokens = self.tokenizer.tokenize(example.src)
        mt_tokens = self.tokenizer.tokenize(example.mt)

        # Reserve three positions for [CLS] and the two [SEP] markers so that
        # truncating a long pair never cuts off the closing [SEP].
        src_tokens, mt_tokens = self._truncate_pair(
            src_tokens, mt_tokens, max(self.maxlen - 3, 0))

        tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Segment 0 covers [CLS] + src + [SEP]; segment 1 covers mt + [SEP].
        segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)
        attn_mask = [1] * len(tokens_ids)

        pad_len = self.maxlen - len(tokens_ids)
        if pad_len > 0:
            padding = [0] * pad_len
            tokens_ids += padding
            segment_ids += padding
            attn_mask += padding

        # Safety net for maxlen < 3, where even the special tokens do not fit.
        tokens_ids = tokens_ids[:self.maxlen]
        segment_ids = segment_ids[:self.maxlen]
        attn_mask = attn_mask[:self.maxlen]

        tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)

        if self.score_file is not None:
            # Squash the label with tanh so it lives in the same (-1, 1)
            # range as the model output.
            score = torch.tanh(torch.tensor(example.score, dtype=torch.float))
            return tokens_ids, segment_ids, attn_mask, score
        return tokens_ids, segment_ids, attn_mask
%% Cell type:code id: tags:
class QualityEstimation(nn.Module):
    """BERT encoder followed by an LSTM and a linear head producing one score.

    Args:
        hidden_dim: Hidden size of the LSTM placed on top of BERT.
    """

    def __init__(self, hidden_dim):
        super(QualityEstimation, self).__init__()
        self.hidden_dim = hidden_dim

        # from_pretrained is a classmethod: call it on the class directly
        # instead of first building (and discarding) a randomly initialised
        # model from a default config.
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.25)

        # LSTM and regression layers. The input size is read from the loaded
        # encoder so it always matches the checkpoint.
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size,
                            hidden_size=self.hidden_dim,
                            num_layers=1, batch_first=True,
                            dropout=0, bidirectional=False)
        self.fc1 = nn.Linear(self.hidden_dim, 1)

    def forward(self, token_ids, segment_ids=None, attention_mask=None):
        """Score a batch of encoded sentence pairs.

        Args:
            token_ids: Vocabulary ids passed to BERT as ``input_ids``.
            segment_ids: Passed to BERT as ``token_type_ids``.
            attention_mask: Passed to BERT as ``attention_mask``.

        Returns:
            Tensor with one tanh-squashed score per batch row (last dim 1).
        """
        # Index the first output rather than tuple-unpacking so this works
        # whether the encoder returns a plain tuple or an output object.
        encoded_layers = self.bert(input_ids=token_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=attention_mask)[0]
        encoded_layers = self.dropout(encoded_layers)
        output, _ = self.lstm(encoded_layers)
        # NOTE(review): this reads the LSTM output at the final position,
        # which is a padding position for inputs shorter than the padded
        # length — consider using the last unmasked step instead.
        qe_scores = self.fc1(output[:, -1, :])
        qe_scores = torch.tanh(qe_scores)
        return qe_scores
%% Cell type:code id: tags:
def set_seed(seed=123):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): the original body is missing from this copy of the
    # notebook; this is a conventional reconstruction.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # No-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
def evaluate(model, loss_fn, dataloader, device):
    """Compute the mean batch loss and Pearson correlation over a dataloader.

    Args:
        model: Module called as ``model(token_ids, segment_ids, attn_masks)``.
        loss_fn: Callable applied to flattened predictions and labels.
        dataloader: Iterable of ``(token_ids, segment_ids, attn_masks, labels)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(eval_loss, pearson)``: the loss averaged over batches and the
        Pearson correlation between all predictions and all labels.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    # Switch dropout off for evaluation, then restore whatever mode the
    # caller had the model in.
    was_training = model.training
    model.eval()

    total_loss = 0.0
    num_batches = 0
    preds, refs = [], []
    try:
        with torch.no_grad():
            for token_ids, segment_ids, attn_masks, labels in dataloader:
                token_ids = token_ids.to(device)
                segment_ids = segment_ids.to(device)
                attn_masks = attn_masks.to(device)
                labels = labels.to(device)

                qe_scores = model(token_ids, segment_ids, attn_masks)
                loss = loss_fn(qe_scores.view(-1), labels.view(-1))

                preds.append(qe_scores.detach().cpu().numpy().reshape(-1))
                refs.append(labels.detach().cpu().numpy().reshape(-1))
                total_loss += loss.item()
                num_batches += 1
    finally:
        model.train(was_training)

    if num_batches == 0:
        raise ValueError("evaluate() received an empty dataloader")

    # Concatenate once at the end instead of re-allocating on every batch.
    pred = np.concatenate(preds)
    ref = np.concatenate(refs)
    eval_loss = total_loss / num_batches
    pearson = np.corrcoef(pred, ref)[0, 1]
    return eval_loss, pearson
def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device,
          scheduler=None, save_path='best_model.pt'):
    """Train the model, validating and checkpointing after every epoch.

    Args:
        model: Module called as ``model(token_ids, segment_ids, attn_masks)``.
        loss_fn: Loss applied to flattened predictions and labels.
        optimizer: Optimizer already bound to ``model.parameters()``.
        train_loader: Iterable of ``(token_ids, segment_ids, attn_masks, labels)``.
        val_loader: Same layout, used for the per-epoch evaluation.
        num_epoch: Number of passes over ``train_loader``.
        device: Device the model and batches are moved to.
        scheduler: Optional learning-rate scheduler stepped once per epoch.
        save_path: Where the best model's ``state_dict`` is written.
            NOTE(review): the original file name is missing from this copy of
            the notebook; the default here is a placeholder.
    """
    # Module.to() works in place, so this is harmless if the caller has
    # already moved the model.
    model.to(device)
    best_pearson = -float('inf')

    for ep in range(num_epoch):
        print('======= Epoch {:} ======='.format(ep))
        model.train()
        for it, (token_ids, segment_ids, attn_masks, labels) in enumerate(train_loader):
            # Clear gradients
            optimizer.zero_grad()
            # Move the batch to the target device
            token_ids = token_ids.to(device)
            segment_ids = segment_ids.to(device)
            attn_masks = attn_masks.to(device)
            labels = labels.to(device)
            # Obtaining scores from the model
            qe_scores = model(token_ids, segment_ids, attn_masks)
            # Computing loss
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))
            # Backpropagating the gradients; clipping must come after
            # backward() so there are gradients to clip.
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Optimization step
            optimizer.step()
            if it % 100 == 0 and not it == 0:
                print("Iteration {} of epoch {} complete".format(it, ep))

        # The value printed as "RMSE" is the mean batch loss returned by
        # evaluate(), i.e. whatever loss_fn computes.
        rmse, pearson = evaluate(model, loss_fn, val_loader, device)
        print("Epoch {} complete! RMSE: {}, Pearson: {}".format(ep, rmse, pearson))
        if pearson > best_pearson:
            print("Best Pearson improved from {} to {}, saving model...".format(best_pearson, pearson))
            best_pearson = pearson
            torch.save(model.state_dict(), save_path)
        if scheduler is not None:
            scheduler.step()
%% Cell type:code id: tags:
# Training configuration and entry point.
PATH = Path("./")
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print("Using GPU: {}".format(use_cuda))

model = QualityEstimation(hidden_dim=128)
# Move the model to the target device before building the optimizer.
model.to(device)
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.00002)
# Multiply the learning rate by 0.8 on every scheduler step.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)

MAX_LEN = 80
# NOTE(review): the translation file names were empty in this copy of the
# notebook; they are restored here by analogy with the *.src / *.scores names.
train_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'train.ende.src',
                     mt_file=PATH/'train.ende.mt', score_file=PATH/'train.ende.scores')
val_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'dev.ende.src',
                   mt_file=PATH/'dev.ende.mt', score_file=PATH/'dev.ende.scores')
# Shuffle the training data each epoch; keep validation order fixed.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32)

num_epoch = 4
train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler)
%% Output
Using GPU: True
======= Epoch 0 =======
Iteration 100 of epoch 0 complete
Iteration 200 of epoch 0 complete
Epoch 0 complete! RMSE: 0.20935339806601405, Pearson: 0.14687585604000064
Best Pearson improved from -inf to 0.14687585604000064, saving model...
======= Epoch 1 =======
Iteration 100 of epoch 1 complete
Iteration 200 of epoch 1 complete
Epoch 1 complete! RMSE: 0.20592319779098034, Pearson: 0.209848575646105
Best Pearson improved from 0.14687585604000064 to 0.209848575646105, saving model...
======= Epoch 2 =======
Iteration 100 of epoch 2 complete
Iteration 200 of epoch 2 complete
Epoch 2 complete! RMSE: 0.21317375591024756, Pearson: 0.17407713168949893
======= Epoch 3 =======
Iteration 100 of epoch 3 complete
Iteration 200 of epoch 3 complete
Epoch 3 complete! RMSE: 0.2287141541019082, Pearson: 0.15072615365604347
%% Cell type:code id: tags:
def writeScores(method_name, scores, filename="predictions.txt"):
    """Write one score per line to a text file.

    Args:
        method_name: Unused; kept so existing calls keep working.
        scores: Iterable of values, each written with ``str()`` formatting.
        filename: Output path; defaults to ``predictions.txt``.
    """
    with open(filename, 'w') as output_file:
        output_file.writelines("{}\n".format(x) for x in scores)
%% Cell type:code id: tags:
# Inference on the unlabeled test set.
# NOTE(review): the translation file name was empty in this copy of the
# notebook; it is restored by analogy with 'test.ende.src'.
test_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'test.ende.src', mt_file=PATH/'test.ende.mt')
# Use ordinary mini-batches instead of pushing the whole test set through
# the model at once; the loader is not shuffled, so order is preserved.
test_loader = DataLoader(test_set, batch_size=32)

model = QualityEstimation(hidden_dim=128)
# NOTE(review): a freshly built model has an untrained LSTM/linear head. The
# checkpoint file name is missing from this copy of the notebook — point this
# at the file written during training.
checkpoint_path = 'best_model.pt'
if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print("WARNING: checkpoint '{}' not found; predicting with untrained weights".format(checkpoint_path))
model.to(device)
model.eval()  # disable dropout for deterministic predictions

print('Start testing...')
batch_scores = []
with torch.no_grad():
    for token_ids, segment_ids, attn_masks in test_loader:
        # Move the batch to the target device
        token_ids = token_ids.to(device)
        segment_ids = segment_ids.to(device)
        attn_masks = attn_masks.to(device)
        # Obtaining scores from the model
        batch_out = model(token_ids, segment_ids, attn_masks)
        batch_scores.append(batch_out.detach().cpu().numpy().reshape(-1))
print('Testing finished!')

# Undo the tanh squashing. Clip first so a saturated output of exactly +/-1
# does not become +/-inf.
qe_scores = np.concatenate(batch_scores).astype(np.float64)
qe_scores = np.arctanh(np.clip(qe_scores, -1 + 1e-6, 1 - 1e-6)).tolist()
%% Output
Start testing...
Testing finished!
%% Cell type:code id: tags:
from google.colab import files
from zipfile import ZipFile

# Package the predictions file for download from Colab.
# NOTE(review): the archive name and the body of the `with` block are missing
# from this copy of the notebook. The name below is a placeholder, and the
# body assumes the file to ship is the 'predictions.txt' produced by
# writeScores — confirm both.
archive_name = "predictions.zip"
with ZipFile(archive_name, "w") as newzip:
    newzip.write("predictions.txt")
files.download(archive_name)
%% Output
