Commit 956b2e9e authored by Andy Wang

Add preprocessing file with comments

parent 1e78d04c
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Preprocessing.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyNJs6YDw8Xd5tURu+80y4cI"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"e050-vUr6trI","colab_type":"code","outputId":"55e9b1da-3e1e-4752-8e14-0c0d38b9712d","executionInfo":{"status":"ok","timestamp":1582493541699,"user_tz":0,"elapsed":7869,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":462}},"source":["# Download and unzip the data\n","from os.path import exists\n","if not exists('ende_data.zip'):\n"," !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d\n"," !unzip ende_data.zip"],"execution_count":1,"outputs":[{"output_type":"stream","text":["--2020-02-23 21:32:16-- https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d\n","Resolving competitions.codalab.org (competitions.codalab.org)... 129.175.22.230\n","Connecting to competitions.codalab.org (competitions.codalab.org)|129.175.22.230|:443... connected.\n","HTTP request sent, awaiting response... 302 FOUND\n","Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/104ea/en-de.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=7be81a201867dad4963bb34c7b1a9911bc37ba50b75f7a80dfc01838ff8ac953&X-Amz-Date=20200223T213217Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200223%2Fnewcodalab%2Fs3%2Faws4_request [following]\n","--2020-02-23 21:32:17-- https://newcodalab.lri.fr/prod-private/dataset_data_file/None/104ea/en-de.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=7be81a201867dad4963bb34c7b1a9911bc37ba50b75f7a80dfc01838ff8ac953&X-Amz-Date=20200223T213217Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200223%2Fnewcodalab%2Fs3%2Faws4_request\n","Resolving newcodalab.lri.fr (newcodalab.lri.fr)... 129.175.15.11\n","Connecting to newcodalab.lri.fr (newcodalab.lri.fr)|129.175.15.11|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 864010 (844K) [application/zip]\n","Saving to: ‘ende_data.zip’\n","\n","ende_data.zip 100%[===================>] 843.76K 547KB/s in 1.5s \n","\n","2020-02-23 21:32:20 (547 KB/s) - ‘ende_data.zip’ saved [864010/864010]\n","\n","Archive: ende_data.zip\n"," inflating: dev.ende.mt \n"," inflating: dev.ende.scores \n"," inflating: dev.ende.src \n"," inflating: test.ende.mt \n"," inflating: test.ende.src \n"," inflating: train.ende.mt \n"," inflating: train.ende.scores \n"," inflating: train.ende.src \n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"nE0SFuUa6Myu","colab_type":"code","outputId":"495cc0b8-79cf-481a-98d4-56c03aaa2776","executionInfo":{"status":"ok","timestamp":1582493812526,"user_tz":0,"elapsed":278683,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":612}},"source":["# Import libraries for preprocessing\n","import spacy\n","import numpy as np\n","import string\n","import pickle\n","from nltk import download\n","from nltk.corpus import stopwords\n","\n","\n","# For colab\n","!spacy download de_core_news_md\n","!spacy link de_core_news_md de300\n","!spacy download en_core_web_lg\n","!spacy link en_core_web_lg en300\n","\n","# downloading stopwords from the nltk package, comment out after file has been loaded once\n","download('stopwords')\n"],"execution_count":2,"outputs":[{"output_type":"stream","text":["Collecting de_core_news_md==2.1.0\n","\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-2.1.0/de_core_news_md-2.1.0.tar.gz (220.8MB)\n","\u001b[K |████████████████████████████████| 220.8MB 9.1MB/s \n","\u001b[?25hBuilding wheels for collected packages: de-core-news-md\n"," Building wheel for de-core-news-md (setup.py) ... \u001b[?25l\u001b[?25hdone\n"," Created wheel for de-core-news-md: filename=de_core_news_md-2.1.0-cp36-none-any.whl size=224546880 sha256=355da410ccd1e5f745af874375ca2876d9221e1cdf4bfee365187154cff33e9f\n"," Stored in directory: /tmp/pip-ephem-wheel-cache-i174emb9/wheels/44/34/f1/31d4b0fa32008c09695ccb180865f196ecd9d512c146f99749\n","Successfully built de-core-news-md\n","Installing collected packages: de-core-news-md\n","Successfully installed de-core-news-md-2.1.0\n","\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the model via spacy.load('de_core_news_md')\n","\u001b[38;5;2m✔ Linking successful\u001b[0m\n","/usr/local/lib/python3.6/dist-packages/de_core_news_md -->\n","/usr/local/lib/python3.6/dist-packages/spacy/data/de300\n","You can now load the model via spacy.load('de300')\n","Collecting en_core_web_lg==2.1.0\n","\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz (826.9MB)\n","\u001b[K |████████████████████████████████| 826.9MB 1.1MB/s \n","\u001b[?25hBuilding wheels for collected packages: en-core-web-lg\n"," Building wheel for en-core-web-lg (setup.py) ... 
Output: de_core_news_md-2.1.0 and en_core_web_lg-2.1.0 installed and linked as de300 and en300; nltk stopwords downloaded to /root/nltk_data.

# Load the linked models into memory
spacy_en = spacy.load('en300')
spacy_de = spacy.load('de300')

# Stop words from nltk
stop_words_en = set(stopwords.words('english'))
stop_words_de = set(stopwords.words('german'))

# Read the training files in
with open("./train.ende.src", "r") as ende_src:
    en_sentences = [line.rstrip() for line in ende_src]
with open("./train.ende.mt", "r") as ende_mt:
    de_sentences = [line.rstrip() for line in ende_mt]
with open("./train.ende.scores", "r") as ende_scores:
    scores = [float(line.rstrip()) for line in ende_scores]

# Tokenisation
# https://spacy.io/usage/linguistic-features#tokenization
# Split each sentence into tokens, drop stop words and punctuation,
# and keep the pretrained embedding vector of every remaining token
punctuation = list(string.punctuation)

en_sentences_vectors = []
for sentence in en_sentences:
    sentence_vectors = []
    for word in spacy_en.tokenizer(sentence.lower()):
        if word.text in stop_words_en or word.text in punctuation:
            continue
        sentence_vectors.append(word.vector)
    en_sentences_vectors.append(sentence_vectors)

de_sentences_vectors = []
for sentence in de_sentences:
    sentence_vectors = []
    for word in spacy_de.tokenizer(sentence.lower()):
        if word.text in stop_words_de or word.text in punctuation:
            continue
        sentence_vectors.append(word.vector)
    de_sentences_vectors.append(sentence_vectors)

# Save the pretrained embeddings
with open("en_vectors.pk", "wb") as f:
    pickle.dump(en_sentences_vectors, f)
with open("de_vectors.pk", "wb") as f:
    pickle.dump(de_sentences_vectors, f)
with open("scores.pk", "wb") as f:
    pickle.dump(scores, f)
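Note (not part of the committed notebook): the pickles above hold one variable-length list of token vectors per sentence. A minimal sketch of how a downstream script might read them back and zero-pad them into a fixed-shape array; load_padded, max_len=50 and dim=300 are illustrative choices (300 matches the en300/de300 link names, not something the notebook asserts).

# Hypothetical loader: read the pickled per-sentence token vectors back and
# zero-pad them to a fixed (num_sentences, max_len, dim) array for batching.
import pickle
import numpy as np

def load_padded(path, max_len=50, dim=300):
    with open(path, "rb") as f:
        sentences = pickle.load(f)  # list of lists of vectors
    batch = np.zeros((len(sentences), max_len, dim), dtype=np.float32)
    for i, vectors in enumerate(sentences):
        for j, vec in enumerate(vectors[:max_len]):
            batch[i, j] = vec
    return batch

en_batch = load_padded("en_vectors.pk")  # assumed shape: (num_sentences, 50, 300)
de_batch = load_padded("de_vectors.pk")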
embeddings\n","\n","punctuation = list(string.punctuation)\n","\n","en_sentences_tokenized = []\n","en_set_words = set()\n","for sentence in en_sentences:\n"," sentence_tokenized = []\n"," for word in spacy_en.tokenizer(sentence.lower()):\n"," word_str = word.text\n"," if word_str in stop_words_en or word_str in punctuation:\n"," continue\n"," sentence_tokenized.append(word_str)\n"," en_set_words.add(word_str)\n"," en_sentences_tokenized.append(sentence_tokenized)\n","\n","de_sentences_tokenized = []\n","de_set_words = set()\n","for sentence in de_sentences:\n"," sentence_tokenized = []\n"," for word in spacy_de.tokenizer(sentence.lower()):\n"," word_str = word.text\n"," if word_str in stop_words_de or word_str in punctuation:\n"," continue\n"," sentence_tokenized.append(word_str)\n"," de_set_words.add(word_str)\n"," de_sentences_tokenized.append(sentence_tokenized)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"UL2LrLiM-Zkj","colab_type":"code","colab":{}},"source":["# Use code from tutorial to create a vocabulary with which to map words to indices\n","class Vocabulary(object):\n"," def __init__(self):\n"," # Mapping from tokens to integers\n"," self._word2idx = {}\n","\n"," # Reverse-mapping from integers to tokens\n"," self.idx2word = []\n","\n"," # 0-padding token\n"," self.add_word('<pad>')\n"," # Unknown words\n"," self.add_word('<unk>')\n","\n"," self._unk_idx = self._word2idx['<unk>']\n","\n"," def word2idx(self, word):\n"," return self._word2idx.get(word, self._unk_idx)\n","\n"," def add_word(self, word):\n"," if word not in self._word2idx:\n"," self.idx2word.append(word)\n"," self._word2idx[word] = len(self.idx2word) - 1\n","\n"," def build_from_set(self, words):\n"," \"\"\"Builds a vocabulary from a given corpus file.\"\"\"\n"," for word in words:\n"," self.add_word(word)\n","\n"," def convert_idxs_to_words(self, idxs):\n"," \"\"\"Converts a list of indices to words.\"\"\"\n"," return ' '.join(self.idx2word[idx] for idx in idxs)\n","\n"," def convert_words_to_idxs(self, words):\n"," \"\"\"Converts a list of words to a list of indices.\"\"\"\n"," return [self.word2idx(w) for w in words]\n","\n"," def __len__(self):\n"," \"\"\"Returns the size of the vocabulary.\"\"\"\n"," return len(self.idx2word)\n"," \n"," def __repr__(self):\n"," return \"Vocabulary with {} items\".format(self.__len__())"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"pHyLTcCl-21D","colab_type":"code","outputId":"c62f1b72-aed1-45c2-d0f5-8a35f04c6ed4","executionInfo":{"status":"ok","timestamp":1582493902578,"user_tz":0,"elapsed":368708,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":68}},"source":["# Initialise vocabularies\n","en_vocab = Vocabulary()\n","en_vocab.build_from_set(en_set_words)\n","de_vocab = Vocabulary()\n","de_vocab.build_from_set(de_set_words)\n","\n","# We explore using only one embedding space for both languages\n","unified_vocab = Vocabulary()\n","unified_vocab.build_from_set(en_set_words.union(de_set_words))\n","\n","print(f\"Vocab size (EN): {len(en_vocab)}\")\n","print(f\"Vocab size (DE): {len(de_vocab)}\")\n","print(f\"Vocab size (UNIFIED): {len(unified_vocab)}\")"],"execution_count":9,"outputs":[{"output_type":"stream","text":["Vocab size (EN): 25237\n","Vocab size (DE): 27854\n","Vocab size (UNIFIED): 42154\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"oly_6eKdBs0R","colab_type":"code","colab":{}},"source":["# Convert to list of 
indices\n","en_indices = []\n","for sentence in en_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(en_vocab.word2idx(word))\n"," en_indices.append(sentence_indices)\n","\n","de_indices = []\n","for sentence in de_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(de_vocab.word2idx(word))\n"," de_indices.append(sentence_indices)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"wKEI5zZcBxcv","colab_type":"code","colab":{}},"source":["with open(\"en_indices.pk\", \"wb\") as f:\n"," pickle.dump(en_indices, f)\n","with open(\"de_indices.pk\", \"wb\") as f:\n"," pickle.dump(de_indices, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"EmWnoi_h6M_m","colab_type":"code","colab":{}},"source":["# Convert to list of indices, using shared vocab\n","en_indices = []\n","for sentence in en_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(unified_vocab.word2idx(word))\n"," en_indices.append(sentence_indices)\n","\n","de_indices = []\n","for sentence in de_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(unified_vocab.word2idx(word))\n"," de_indices.append(sentence_indices)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"PHy8DIID6Xtb","colab_type":"code","colab":{}},"source":["with open(\"en_indices_shared.pk\", \"wb\") as f:\n"," pickle.dump(en_indices, f)\n","with open(\"de_indices_shared.pk\", \"wb\") as f:\n"," pickle.dump(de_indices, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"GDRL7ZnZUt-l","colab_type":"code","colab":{}},"source":["# Create average embeddings for each sentence\n","en_sentences_vectors = []\n","for sentence in en_sentences:\n"," en_sentences_vectors.append(spacy_en.tokenizer(sentence.lower()).vector)\n","\n","de_sentences_vectors = []\n","for sentence in de_sentences:\n"," de_sentences_vectors.append(spacy_en.tokenizer(sentence.lower()).vector)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"OBLPz2izVMUq","colab_type":"code","colab":{}},"source":["with open(\"en_sentence_emb.pk\", \"wb\") as f:\n"," pickle.dump(en_sentences_vectors, f)\n","with open(\"de_sentence_emb.pk\", \"wb\") as f:\n"," pickle.dump(de_sentences_vectors, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"8VsV5Eod9gao","colab_type":"code","colab":{}},"source":["# Create embeddings for the test set\n","# Read the files in\n","with open(\"./test.ende.src\", \"r\") as ende_src:\n"," en_sentences_test = [line.rstrip() for line in ende_src]\n","with open(\"./test.ende.mt\", \"r\") as ende_mt:\n"," de_sentences_test = [line.rstrip() for line in ende_mt]"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Xif4ydIV96FU","colab_type":"code","colab":{}},"source":["punctuation = list(string.punctuation)\n","\n","en_sentences_vectors = []\n","for sentence in en_sentences_test:\n"," sentence_vectors = []\n"," for word in spacy_en.tokenizer(sentence.lower()):\n"," if word.text in stop_words_en or word.text in punctuation:\n"," continue\n"," sentence_vectors.append(word.vector)\n"," en_sentences_vectors.append(sentence_vectors)\n","\n","de_sentences_vectors = []\n","for sentence in de_sentences_test:\n"," sentence_vectors = []\n"," for word in spacy_de.tokenizer(sentence.lower()):\n"," if word.text in stop_words_de or word.text in 
punctuation:\n"," continue\n"," sentence_vectors.append(word.vector)\n"," de_sentences_vectors.append(sentence_vectors)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"9H-r9k7T98zb","colab_type":"code","colab":{}},"source":["with open(\"en_vectors_test.pk\", \"wb\") as f:\n"," pickle.dump(en_sentences_vectors, f)\n","with open(\"de_vectors_test.pk\", \"wb\") as f:\n"," pickle.dump(de_sentences_vectors, f)"],"execution_count":0,"outputs":[]}]}