Park, Se / NLP_CW · Commits

Commit a5eed89b ("Latest BERT model")
Authored 5 years ago by Se Park · parent 04f49c4f · branch: master
1 changed file: BERT_LSTM_FFNN.ipynb (new file, 473 additions, 0 deletions; Colab notebook, Python 3 kernel, GPU runtime)
%% Cell type:code id: tags:
```
! pip install transformers
```
%% Output
Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (2.5.1)
Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.38)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)
Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.28.1)
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.11.15)
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.85)
Requirement already satisfied: tokenizers==0.5.2 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.5.2)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.17.5)
Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)
Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.0)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2019.11.28)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)
Requirement already satisfied: botocore<1.15.0,>=1.14.15 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.14.15)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.4)
Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (0.15.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (2.6.1)
%% Cell type:code id: tags:
```
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
from pathlib import Path
import os
```
%% Cell type:code id: tags:
```
from os.path import exists
if not exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip
```
%% Cell type:code id: tags:
```
class Data(object):
    """A single training/test example for the dataset."""
    def __init__(self, src, mt, score=None):
        self.src = src
        self.mt = mt
        self.score = score

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        l = ["src: {}".format(self.src), "mt: {}".format(self.mt)]
        if self.score is not None:
            l.append("label: {}".format(self.score))
        return ", ".join(l)

class LoadData(Dataset):

    def __init__(self, maxlen, src_file, mt_file, score_file=None):

        self.score_file = score_file
        with open(src_file, 'r', encoding='utf-8') as f:
            src_sentences = f.readlines()
        with open(mt_file, 'r', encoding='utf-8') as f:
            mt_sentences = f.readlines()
        if self.score_file is not None:
            with open(score_file, 'r', encoding='utf-8') as f:
                scores = f.readlines()
            self.data = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
                         for s, m, h in zip(src_sentences, mt_sentences, scores)]
        else:
            self.data = [Data(src=s.strip(), mt=m.strip())
                         for s, m in zip(src_sentences, mt_sentences)]

        # The multilingual checkpoint is cased, so the tokenizer should not lowercase
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting the sentence pair and label at the specified index
        src = self.data[index].src
        mt = self.data[index].mt
        score = self.data[index].score

        # Preprocessing the text to be suitable for BERT: tokenize both sentences
        src_tokens = self.tokenizer.tokenize(src)
        mt_tokens = self.tokenizer.tokenize(mt)

        # Inserting the CLS and SEP tokens at the beginning and end of each sentence
        tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
        # Obtaining the indices of the tokens in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Segment 0 for the source sentence, segment 1 for the MT output
        segment_ids = [0]*(len(src_tokens)+2) + [1]*(len(mt_tokens)+1)
        attn_mask = [1]*len(tokens_ids)

        if len(tokens) < self.maxlen:
            # Padding sentences to maxlen
            padding = [0] * (self.maxlen-len(tokens_ids))
            tokens_ids += padding
            segment_ids += padding
            attn_mask += padding
        else:
            # Pruning the lists to the specified max length
            tokens_ids = tokens_ids[:self.maxlen]
            segment_ids = segment_ids[:self.maxlen]
            attn_mask = attn_mask[:self.maxlen]

        # Converting the lists to PyTorch tensors
        tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)

        if self.score_file is not None:
            # Scores are squashed to (-1, 1) with tanh; predictions are mapped back with arctanh at test time
            score = torch.tensor(score, dtype=torch.float)
            score = torch.tanh(score)
            return tokens_ids, attn_mask, segment_ids, score
        return tokens_ids, attn_mask, segment_ids
```
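%% Cell type:markdown id: tags:

The following cell is an illustrative sanity check, not part of the original commit: it pushes one made-up English/German pair through the same encoding that `LoadData.__getitem__` performs (tokenize both sentences, join them as `[CLS] src [SEP] mt [SEP]`, build segment ids and an attention mask, then pad or truncate to `maxlen`). The sentence pair and the `maxlen` value are invented for the example.

%% Cell type:code id: tags:
```
# Illustrative example; the sentence pair and maxlen are made up
tok = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
src, mt, maxlen = "The cat sat on the mat.", "Die Katze saß auf der Matte.", 32

src_tokens = tok.tokenize(src)
mt_tokens = tok.tokenize(mt)
tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
token_ids = tok.convert_tokens_to_ids(tokens)
segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)
attn_mask = [1] * len(token_ids)

# Pad (or truncate) all three lists to maxlen, as LoadData.__getitem__ does
pad = [0] * max(0, maxlen - len(token_ids))
token_ids, segment_ids, attn_mask = (x[:maxlen] + pad for x in (token_ids, segment_ids, attn_mask))
print(tokens[:6])
print(len(token_ids), len(segment_ids), len(attn_mask))  # all equal to maxlen
```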
%% Cell type:code id: tags:
```
class QualityEstimation(nn.Module):

    def __init__(self, hidden_dim):
        super(QualityEstimation, self).__init__()
        self.hidden_dim = hidden_dim

        # Instantiating the pretrained multilingual BERT encoder
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.25)

        # LSTM and regression layers
        self.lstm = nn.LSTM(input_size=768, hidden_size=self.hidden_dim,
                            num_layers=1, batch_first=True,
                            dropout=0, bidirectional=False)
        self.fc1 = nn.Linear(self.hidden_dim, 1)
        nn.init.kaiming_normal_(self.fc1.weight)

    def forward(self, token_ids, segment_ids=None, attention_mask=None):

        # BERT returns the sequence of token embeddings with shape (batch, seq_len, 768)
        encoded_layers, _ = self.bert(input_ids=token_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
        encoded_layers = self.dropout(encoded_layers)
        output, _ = self.lstm(encoded_layers)
        # Use the LSTM output at the last position and squash the score into (-1, 1)
        qe_scores = self.fc1(output[:,-1,:])
        qe_scores = torch.tanh(qe_scores)

        return qe_scores
```
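%% Cell type:markdown id: tags:

As an illustrative dry run (not in the original notebook), the cell below feeds a small batch of random token ids through `QualityEstimation` and checks the output shape: one tanh-squashed score per example. The dummy tensors and their sizes are invented for the example.

%% Cell type:code id: tags:
```
# Illustrative shape check with random dummy inputs; assumes QualityEstimation above is defined
qe = QualityEstimation(hidden_dim=128)
batch, seq_len = 2, 80
dummy_ids = torch.randint(0, 1000, (batch, seq_len), dtype=torch.long)
dummy_segments = torch.zeros(batch, seq_len, dtype=torch.long)
dummy_mask = torch.ones(batch, seq_len, dtype=torch.long)

with torch.no_grad():
    out = qe(dummy_ids, dummy_segments, dummy_mask)
print(out.shape)  # expected: torch.Size([2, 1]), one score per example
```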
%% Cell type:code id: tags:
```
def set_seed(seed=123):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

def evaluate(model, loss_fn, dataloader, device):
    model.eval()
    eval_loss = 0
    pred, ref = np.array([]), np.array([])
    count = 0
    with torch.no_grad():
        # The dataset yields (token_ids, attn_mask, segment_ids, score) in that order
        for token_ids, attn_masks, segment_ids, labels in dataloader:
            token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
            qe_scores = model(token_ids, segment_ids, attn_masks)
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))

            qe_scores = qe_scores.detach().cpu().numpy()
            qe_scores = qe_scores.reshape((qe_scores.shape[0],))
            labels = labels.to('cpu').numpy()

            pred = np.concatenate((pred, qe_scores))
            ref = np.concatenate((ref, labels))

            eval_loss += loss.item()
            count += 1

    # Mean of the per-batch MSE losses (printed under the label "RMSE" in the training loop)
    eval_loss = eval_loss / count
    # Pearson correlation between predictions and tanh-transformed references
    pearson = np.corrcoef(pred, ref)[0, 1]

    return eval_loss, pearson

def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler=None):

    best_pearson = -float('inf')
    for ep in range(num_epoch):
        print('======= Epoch {:} ======='.format(ep))
        for it, (token_ids, attn_masks, segment_ids, labels) in enumerate(train_loader):
            model.train()
            # Clear gradients
            optimizer.zero_grad()
            # Moving the batch to the target device
            token_ids, segment_ids, attn_masks, labels = token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device)
            # Obtaining scores from the model
            qe_scores = model(token_ids, segment_ids, attn_masks)
            # Computing loss
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))
            # Backpropagating the gradients
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Optimization step
            optimizer.step()
            if it % 100 == 0 and not it == 0:
                print("Iteration {} of epoch {} complete".format(it, ep))

        rmse, pearson = evaluate(model, loss_fn, val_loader, device)
        print("Epoch {} complete! RMSE: {}, Pearson: {}".format(ep, rmse, pearson))
        if pearson > best_pearson:
            print("Best Pearson improved from {} to {}, saving model...".format(best_pearson, pearson))
            best_pearson = pearson
            torch.save(model.state_dict(), 'modelNLP.pt')

        if scheduler is not None:
            scheduler.step()
```
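%% Cell type:markdown id: tags:

Note that `evaluate` returns the mean of the per-batch `MSELoss` values, even though the training loop prints it under the label "RMSE". The sketch below (illustrative only, reusing the epoch-0 value from the output further down) shows how that logged mean squared error would be converted into an actual RMSE on the tanh-transformed score scale.

%% Cell type:code id: tags:
```
# Illustrative only: relating the logged value to a true RMSE
logged_mse = 0.2094          # e.g. the epoch-0 value printed in the training output
print(np.sqrt(logged_mse))   # about 0.458: RMSE on the tanh(score) scale used for training
```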
%% Cell type:code id: tags:
```
PATH = Path("./")
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print("Using GPU: {}".format(use_cuda))
set_seed()
model = QualityEstimation(hidden_dim=128)
model.to(device)
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.00002)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)
MAX_LEN = 80
train_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'train.ende.src', mt_file=PATH/'train.ende.mt', score_file=PATH/'train.ende.scores')
val_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'dev.ende.src', mt_file=PATH/'dev.ende.mt', score_file=PATH/'dev.ende.scores')
train_loader = DataLoader(train_set, batch_size=32)
val_loader = DataLoader(val_set, batch_size=32)
num_epoch = 4
train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler)
```
%% Output
Using GPU: True
======= Epoch 0 =======
Iteration 100 of epoch 0 complete
Iteration 200 of epoch 0 complete
Epoch 0 complete! RMSE: 0.20935339806601405, Pearson: 0.14687585604000064
Best Pearson improved from -inf to 0.14687585604000064, saving model...
======= Epoch 1 =======
Iteration 100 of epoch 1 complete
Iteration 200 of epoch 1 complete
Epoch 1 complete! RMSE: 0.20592319779098034, Pearson: 0.209848575646105
Best Pearson improved from 0.14687585604000064 to 0.209848575646105, saving model...
======= Epoch 2 =======
Iteration 100 of epoch 2 complete
Iteration 200 of epoch 2 complete
Epoch 2 complete! RMSE: 0.21317375591024756, Pearson: 0.17407713168949893
======= Epoch 3 =======
Iteration 100 of epoch 3 complete
Iteration 200 of epoch 3 complete
Epoch 3 complete! RMSE: 0.2287141541019082, Pearson: 0.15072615365604347
%% Cell type:code id: tags:
```
def writeScores(method_name, scores):
    # method_name is unused here; kept to match the submission helper's interface
    fn = "predictions.txt"
    print("")
    with open(fn, 'w') as output_file:
        for x in scores:
            output_file.write(f"{x}\n")
```
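%% Cell type:markdown id: tags:

A minimal usage sketch (made-up scores, not part of the original run): `writeScores` simply writes one prediction per line to `predictions.txt`.

%% Cell type:code id: tags:
```
# Illustrative usage with invented scores; overwrites predictions.txt
writeScores("BERT", [0.12, -0.48, 1.93])
with open("predictions.txt") as f:
    print(f.read())
```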
%% Cell type:code id: tags:
```
test_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'test.ende.src', mt_file=PATH/'test.ende.mt')
test_loader = DataLoader(test_set, batch_size=len(test_set))

model = QualityEstimation(hidden_dim=128)
model.load_state_dict(torch.load('modelNLP.pt'))
model.to(device)

print('Start testing...')

model.eval()
with torch.no_grad():
    # Without a score file the dataset yields (token_ids, attn_mask, segment_ids)
    for token_ids, attn_masks, segment_ids in test_loader:
        # Moving the batch to the target device
        token_ids, segment_ids, attn_masks = token_ids.to(device), segment_ids.to(device), attn_masks.to(device)
        # Obtaining scores from the model
        qe_scores = model(token_ids, segment_ids, attn_masks)

print('Testing finished!')

# Map predictions back from the tanh scale to the original score scale
qe_scores = [np.arctanh(scores[0]) for scores in qe_scores.detach().cpu().numpy()]
```
%% Output
Start testing...
Testing finished!
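%% Cell type:markdown id: tags:

The targets were squashed with `torch.tanh` in `LoadData`, so the cell above maps the model's predictions back to the original score scale with `np.arctanh`. The sketch below (made-up values, not from the original run) illustrates that round trip and why the inverse stays finite: `torch.tanh` keeps the model's outputs strictly inside (-1, 1).

%% Cell type:code id: tags:
```
# Illustrative round trip for the score transform used above (invented values)
raw_scores = np.array([-1.5, 0.0, 0.8])   # hypothetical raw quality scores
squashed = np.tanh(raw_scores)             # what the model is trained to predict
recovered = np.arctanh(squashed)           # what the test cell applies to predictions
print(np.allclose(raw_scores, recovered))  # True
```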
%% Cell type:code id: tags:
```
from google.colab import files
from zipfile import ZipFile
writeScores("BERT",qe_scores)
with ZipFile("en-de_svr.zip","w") as newzip:
newzip.write("predictions.txt")
files.download('en-de_svr.zip')
```
%% Output