# %% [markdown]
# # Sentence-level MT quality estimation (en-de): BERT + LSTM + linear head
#
# Fine-tunes `bert-base-multilingual-cased` on (source, machine translation)
# sentence pairs to regress a quality score. The checkpoint with the best
# dev-set Pearson correlation is kept and used to write `predictions.txt`
# for the test set.
#
# Scores are trained in tanh space: labels are squashed with `tanh` in the
# dataset, the model ends in `tanh`, and test predictions are mapped back
# with `arctanh` before being written.

# %% [markdown]
# ## Setup

# %%
# Install into the environment of the running kernel (the script equivalent
# of `%pip install`). The notebook was originally executed against
# transformers 2.5.1; the model code below indexes BERT's output with `[0]`
# so it does not depend on the tuple return type of that release.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "-q", "transformers"], check=True)

# %%
import os
import random
import subprocess
import sys
from os.path import exists
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig

try:
    # Only available inside Google Colab; used at the very end to download
    # the submission archive through the browser.
    from google.colab import files
except ImportError:
    files = None

# %%
# Everything a reader might want to tune lives in this cell.
PATH = Path("./")
DATA_ZIP = 'ende_data.zip'
DATA_URL = 'https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d'
BERT_NAME = 'bert-base-multilingual-cased'
MODEL_PATH = 'modelNLP.pt'

SEED = 123
MAX_LEN = 80
BATCH_SIZE = 32
HIDDEN_DIM = 128
LEARNING_RATE = 0.00002
LR_DECAY = 0.8
num_epoch = 4

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print("Using GPU: {}".format(use_cuda))

# %% [markdown]
# ## Download the data

# %%
if not exists(DATA_ZIP):
    try:
        subprocess.run(["wget", "-O", DATA_ZIP, DATA_URL], check=True)
        # -o: overwrite without prompting; a prompt would block forever when
        # unzip is not attached to a terminal.
        subprocess.run(["unzip", "-o", DATA_ZIP], check=True)
    except subprocess.CalledProcessError:
        # `wget -O` creates the target even when the download fails; remove
        # it so the next run retries instead of skipping the download.
        if exists(DATA_ZIP):
            os.remove(DATA_ZIP)
        raise

# %% [markdown]
# ## Dataset

# %%
class Data(object):
    """A single training/test example: source sentence, MT output, optional score."""

    def __init__(self, src, mt, score=None):
        self.src = src
        self.mt = mt
        self.score = score

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        l = ["src: {}".format(self.src), "mt: {}".format(self.mt)]
        if self.score is not None:
            l.append("label: {}".format(self.score))
        return ", ".join(l)


class LoadData(Dataset):
    """Line-aligned (source, MT[, score]) files exposed as BERT-ready tensors.

    Each item is ``(token_ids, segment_ids, attn_mask)`` plus a tanh-squashed
    ``score`` when ``score_file`` was given. All three tensors are ``long``
    and have length ``maxlen``. The tuple order matches the positional order
    of ``QualityEstimation.forward`` and the unpacking used by ``train``,
    ``evaluate`` and the test loop.

    Args:
        maxlen: Fixed sequence length after truncation/padding; must leave
            room for ``[CLS]`` and two ``[SEP]`` tokens.
        src_file: Path to the source sentences, one per line.
        mt_file: Path to the machine translations, one per line.
        score_file: Optional path to the scores, one float per line.

    Raises:
        ValueError: If ``maxlen`` is below 3 or the files differ in length.
    """

    def __init__(self, maxlen, src_file, mt_file, score_file=None):
        if maxlen < 3:
            raise ValueError("maxlen must be at least 3 to fit [CLS] and two [SEP] tokens")

        self.score_file = score_file
        src_sentences = self._read_lines(src_file)
        mt_sentences = self._read_lines(mt_file)
        # zip() would silently drop the tail of the longer file and misalign
        # nothing visibly, so check the alignment explicitly.
        if len(src_sentences) != len(mt_sentences):
            raise ValueError("{} and {} have different numbers of lines ({} vs {})".format(
                src_file, mt_file, len(src_sentences), len(mt_sentences)))

        if self.score_file is not None:
            scores = self._read_lines(score_file)
            if len(scores) != len(src_sentences):
                raise ValueError("{} has {} lines but {} sentence pairs were read".format(
                    score_file, len(scores), len(src_sentences)))
            self.data = [Data(src=s.strip(), mt=m.strip(), score=float(h.strip()))
                         for s, m, h in zip(src_sentences, mt_sentences, scores)]
        else:
            self.data = [Data(src=s.strip(), mt=m.strip())
                         for s, m in zip(src_sentences, mt_sentences)]

        # The checkpoint is a *cased* model, so the text must not be
        # lower-cased before WordPiece lookup.
        self.tokenizer = BertTokenizer.from_pretrained(BERT_NAME, do_lower_case=False)
        self.maxlen = maxlen

    @staticmethod
    def _read_lines(path):
        """Return all lines of a UTF-8 text file."""
        with open(path, 'r', encoding='utf-8') as f:
            return f.readlines()

    @staticmethod
    def _truncate_pair(src_tokens, mt_tokens, budget):
        """Trim the two token lists in place until they hold at most ``budget`` tokens.

        Always removes from the end of the currently longer list so that
        neither segment is dropped entirely.
        """
        while len(src_tokens) + len(mt_tokens) > budget:
            longer = src_tokens if len(src_tokens) > len(mt_tokens) else mt_tokens
            longer.pop()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example = self.data[index]

        src_tokens = self.tokenizer.tokenize(example.src)
        mt_tokens = self.tokenizer.tokenize(example.mt)
        # Reserve three positions for [CLS] and the two [SEP] markers so the
        # final [SEP] is never cut off.
        self._truncate_pair(src_tokens, mt_tokens, self.maxlen - 3)

        tokens = ["[CLS]"] + src_tokens + ["[SEP]"] + mt_tokens + ["[SEP]"]
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Segment 0 covers [CLS] + source + first [SEP]; segment 1 covers MT + last [SEP].
        segment_ids = [0] * (len(src_tokens) + 2) + [1] * (len(mt_tokens) + 1)
        attn_mask = [1] * len(tokens_ids)

        # Pad up to the fixed length; padded positions are masked out.
        pad_len = self.maxlen - len(tokens_ids)
        tokens_ids += [self.tokenizer.pad_token_id] * pad_len
        segment_ids += [0] * pad_len
        attn_mask += [0] * pad_len

        tokens_ids = torch.tensor(tokens_ids, dtype=torch.long)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)

        if self.score_file is not None:
            # Labels are squashed into (-1, 1) to match the model's tanh output.
            score = torch.tanh(torch.tensor(example.score, dtype=torch.float))
            return tokens_ids, segment_ids, attn_mask, score
        return tokens_ids, segment_ids, attn_mask

# %% [markdown]
# ## Model

# %%
class QualityEstimation(nn.Module):
    """BERT encoder followed by a single-layer LSTM and a linear regression head.

    Args:
        hidden_dim: Hidden size of the LSTM.
    """

    def __init__(self, hidden_dim):
        super(QualityEstimation, self).__init__()
        self.hidden_dim = hidden_dim

        # from_pretrained is a classmethod; building a randomly initialised
        # BertModel first would only waste time and memory.
        self.bert = BertModel.from_pretrained(BERT_NAME)
        self.dropout = nn.Dropout(0.25)

        # LSTM and regression layers
        self.lstm = nn.LSTM(input_size=self.bert.config.hidden_size, hidden_size=self.hidden_dim,
                            num_layers=1, batch_first=True,
                            dropout=0, bidirectional=False)
        self.fc1 = nn.Linear(self.hidden_dim, 1)
        nn.init.kaiming_normal_(self.fc1.weight)

    def forward(self, token_ids, segment_ids=None, attention_mask=None):
        """Return tanh-squashed quality scores, one row per input sequence.

        Args:
            token_ids: Token ids, indexed as (batch, sequence).
            segment_ids: Optional token type ids, same layout as ``token_ids``.
            attention_mask: Optional mask, 1 for real tokens and 0 for padding.
        """
        bert_outputs = self.bert(input_ids=token_ids, token_type_ids=segment_ids,
                                 attention_mask=attention_mask)
        # Index 0 is the per-token hidden states; integer indexing works for
        # both tuple and ModelOutput return types.
        encoded_layers = self.dropout(bert_outputs[0])
        output, _ = self.lstm(encoded_layers)

        if attention_mask is None:
            last_state = output[:, -1, :]
        else:
            # The LSTM is unidirectional, so its state at step t depends only
            # on steps <= t. Reading the state at the last *real* token
            # therefore ignores padding entirely.
            last_idx = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
            batch_idx = torch.arange(output.size(0), device=output.device)
            last_state = output[batch_idx, last_idx]

        qe_scores = torch.tanh(self.fc1(last_state))
        return qe_scores

# %% [markdown]
# ## Training utilities

# %%
def set_seed(seed=123):
    """Seed Python, NumPy and PyTorch (CPU and all GPUs) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def evaluate(model, loss_fn, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradients.

    Returns:
        ``(eval_loss, pearson)``: the mean of the per-batch ``loss_fn``
        values and the Pearson correlation between predictions and labels.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    eval_loss = 0
    pred, ref = np.array([]), np.array([])
    count = 0
    with torch.no_grad():
        for token_ids, segment_ids, attn_masks, labels in dataloader:
            token_ids, segment_ids, attn_masks, labels = (
                token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device))
            qe_scores = model(token_ids, segment_ids, attn_masks)
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))

            pred = np.concatenate((pred, qe_scores.view(-1).cpu().numpy()))
            ref = np.concatenate((ref, labels.view(-1).cpu().numpy()))

            eval_loss += loss.item()
            count += 1

    if count == 0:
        raise ValueError("evaluate() received an empty dataloader")

    eval_loss = eval_loss / count
    pearson = np.corrcoef(pred, ref)[0, 1]

    return eval_loss, pearson


def train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device,
          scheduler=None, save_path='modelNLP.pt'):
    """Fine-tune ``model`` and checkpoint the epoch with the best dev Pearson.

    Args:
        model: Module called as ``model(token_ids, segment_ids, attn_masks)``.
        loss_fn: Loss applied to flattened predictions and labels.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Yields ``(token_ids, segment_ids, attn_masks, labels)``.
        val_loader: Same layout; evaluated after every epoch.
        num_epoch: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        scheduler: Optional LR scheduler stepped once per epoch.
        save_path: Where the best ``state_dict`` is written.

    Returns:
        The best validation Pearson correlation observed.
    """
    best_pearson = -float('inf')
    for ep in range(num_epoch):
        print('======= Epoch {:} ======='.format(ep))
        # evaluate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        for it, (token_ids, segment_ids, attn_masks, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            token_ids, segment_ids, attn_masks, labels = (
                token_ids.to(device), segment_ids.to(device), attn_masks.to(device), labels.to(device))
            qe_scores = model(token_ids, segment_ids, attn_masks)
            loss = loss_fn(qe_scores.view(-1), labels.view(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if it % 100 == 0 and not it == 0:
                print("Iteration {} of epoch {} complete".format(it, ep))

        val_loss, pearson = evaluate(model, loss_fn, val_loader, device)
        # The value is whatever loss_fn computes (MSE in this notebook), not an RMSE.
        print("Epoch {} complete! Val loss: {}, Pearson: {}".format(ep, val_loss, pearson))
        if pearson > best_pearson:
            print("Best Pearson improved from {} to {}, saving model...".format(best_pearson, pearson))
            best_pearson = pearson
            torch.save(model.state_dict(), save_path)

        if scheduler is not None:
            scheduler.step()

    return best_pearson


def writeScores(method_name, scores, fn="predictions.txt"):
    """Write one score per line to ``fn``.

    ``method_name`` is accepted for backward compatibility and is not used.
    """
    with open(fn, 'w') as output_file:
        for x in scores:
            output_file.write(f"{x}\n")

# %% [markdown]
# ## Train

# %%
set_seed(SEED)
model = QualityEstimation(hidden_dim=HIDDEN_DIM)
model.to(device)

loss_fn = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=LR_DECAY)

train_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'train.ende.src', mt_file=PATH/'train.ende.mt', score_file=PATH/'train.ende.scores')
val_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'dev.ende.src', mt_file=PATH/'dev.ende.mt', score_file=PATH/'dev.ende.scores')
# Shuffle the training data each epoch; the order is reproducible via set_seed.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)

train(model, loss_fn, optimizer, train_loader, val_loader, num_epoch, device, scheduler, save_path=MODEL_PATH)

# %% [markdown]
# ## Predict on the test set

# %%
test_set = LoadData(maxlen=MAX_LEN, src_file=PATH/'test.ende.src', mt_file=PATH/'test.ende.mt')
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

# Rebinding `model` releases the last-epoch weights; the best checkpoint is
# loaded onto whichever device is in use.
model = QualityEstimation(hidden_dim=HIDDEN_DIM)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device)

print('Start testing...')

model.eval()
test_batches = []
with torch.no_grad():
    for token_ids, segment_ids, attn_masks in test_loader:
        token_ids, segment_ids, attn_masks = token_ids.to(device), segment_ids.to(device), attn_masks.to(device)
        # Collect every batch; keeping only the last one would drop predictions.
        test_batches.append(model(token_ids, segment_ids, attn_masks).view(-1).cpu().numpy())

print('Testing finished!')

test_preds_tanh = np.concatenate(test_batches).astype(np.float64)
assert len(test_preds_tanh) == len(test_set), "one prediction per test sentence pair expected"

# Undo the tanh squashing. A saturated prediction of exactly +/-1 would map
# to +/-inf, so clip just inside the open interval first.
ARCTANH_EPS = 1e-6
qe_scores = np.arctanh(np.clip(test_preds_tanh, -1 + ARCTANH_EPS, 1 - ARCTANH_EPS))
assert np.isfinite(qe_scores).all()

# %% [markdown]
# ## Write the submission

# %%
writeScores("BERT", qe_scores)

with ZipFile("en-de_svr.zip", "w") as newzip:
    newzip.write("predictions.txt")

if files is not None:
    files.download('en-de_svr.zip')
else:
    print("Not running in Colab; submission archive left at en-de_svr.zip")