Commit 1e78d04c authored by Wang, Andy

Delete Preprocessing.ipynb

parent 68ec4bcf
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Preprocessing.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOT044r/0G+oX0wOKZ1Vezc"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"e050-vUr6trI","colab_type":"code","outputId":"a59cdb7d-3f9a-4240-d213-8efe6bc841bf","executionInfo":{"status":"ok","timestamp":1582147217488,"user_tz":0,"elapsed":4378,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":462}},"source":["# Download and unzip the data\n","from os.path import exists\n","if not exists('ende_data.zip'):\n"," !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d\n"," !unzip ende_data.zip"],"execution_count":1,"outputs":[{"output_type":"stream","text":["--2020-02-19 21:20:14-- https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d\n","Resolving competitions.codalab.org (competitions.codalab.org)... 129.175.22.230\n","Connecting to competitions.codalab.org (competitions.codalab.org)|129.175.22.230|:443... connected.\n","HTTP request sent, awaiting response... 302 FOUND\n","Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/104ea/en-de.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b5c06f73ad0dc600662a86e8d7ad9be29fe996fd59d723caccbff14eb662f469&X-Amz-Date=20200219T212014Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200219%2Fnewcodalab%2Fs3%2Faws4_request [following]\n","--2020-02-19 21:20:14-- https://newcodalab.lri.fr/prod-private/dataset_data_file/None/104ea/en-de.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b5c06f73ad0dc600662a86e8d7ad9be29fe996fd59d723caccbff14eb662f469&X-Amz-Date=20200219T212014Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20200219%2Fnewcodalab%2Fs3%2Faws4_request\n","Resolving newcodalab.lri.fr (newcodalab.lri.fr)... 129.175.15.11\n","Connecting to newcodalab.lri.fr (newcodalab.lri.fr)|129.175.15.11|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 864010 (844K) [application/zip]\n","Saving to: ‘ende_data.zip’\n","\n","ende_data.zip 100%[===================>] 843.76K 1.50MB/s in 0.6s \n","\n","2020-02-19 21:20:15 (1.50 MB/s) - ‘ende_data.zip’ saved [864010/864010]\n","\n","Archive: ende_data.zip\n"," inflating: dev.ende.mt \n"," inflating: dev.ende.scores \n"," inflating: dev.ende.src \n"," inflating: test.ende.mt \n"," inflating: test.ende.src \n"," inflating: train.ende.mt \n"," inflating: train.ende.scores \n"," inflating: train.ende.src \n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"YkaCMyoz6xmQ","colab_type":"code","outputId":"63a9184a-9827-48e3-c285-deab14212e1d","executionInfo":{"status":"ok","timestamp":1582147217489,"user_tz":0,"elapsed":4369,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":153}},"source":["# Check the files\n","import io\n","\n","#English-German\n","print(\"---EN-DE---\")\n","print()\n","\n","with open(\"./train.ende.src\", \"r\") as ende_src:\n"," print(\"Source: \",ende_src.readline())\n","with open(\"./train.ende.mt\", \"r\") as ende_mt:\n"," print(\"Translation: \",ende_mt.readline())\n","with open(\"./train.ende.scores\", \"r\") as ende_scores:\n"," print(\"Score: \",ende_scores.readline())"],"execution_count":2,"outputs":[{"output_type":"stream","text":["---EN-DE---\n","\n","Source: José Ortega y Gasset visited Husserl at Freiburg in 1934.\n","\n","Translation: 1934 besuchte José Ortega y Gasset Husserl in Freiburg.\n","\n","Score: 1.1016968715664406\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"nE0SFuUa6Myu","colab_type":"code","outputId":"96455145-95e5-44db-e600-ba2df9412253","executionInfo":{"status":"ok","timestamp":1582147451355,"user_tz":0,"elapsed":238225,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":612}},"source":["import spacy\n","import numpy as np\n","import string\n","import pickle\n","from nltk import download\n","from nltk.corpus import stopwords\n","\n","\n","# for colab\n","!spacy download de_core_news_md\n","!spacy link de_core_news_md de300\n","!spacy download en_core_web_lg\n","!spacy link en_core_web_lg en300\n","\n","# downloading stopwords from the nltk package, comment out after file has been loaded once\n","download('stopwords')\n"],"execution_count":3,"outputs":[{"output_type":"stream","text":["Collecting de_core_news_md==2.1.0\n","\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-2.1.0/de_core_news_md-2.1.0.tar.gz (220.8MB)\n","\u001b[K |████████████████████████████████| 220.8MB 1.2MB/s \n","\u001b[?25hBuilding wheels for collected packages: de-core-news-md\n"," Building wheel for de-core-news-md (setup.py) ... 
# Load the models into memory
spacy_en = spacy.load('en300')
spacy_de = spacy.load('de300')

# Stop words from nltk
stop_words_en = set(stopwords.words('english'))
stop_words_de = set(stopwords.words('german'))

# Read the files in
with open("./train.ende.src", "r") as ende_src:
    en_sentences = [line.rstrip() for line in ende_src]
with open("./train.ende.mt", "r") as ende_mt:
    de_sentences = [line.rstrip() for line in ende_mt]
with open("./train.ende.scores", "r") as ende_scores:
    scores = [float(line.rstrip()) for line in ende_scores]
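The three files are line-aligned (one source sentence, one machine translation, and one quality score per line), so a cheap sanity check that the lists have equal lengths is worth running before tokenising; a minimal sketch:

# Sanity check: the corpus is line-aligned, so all three lists must match.
assert len(en_sentences) == len(de_sentences) == len(scores), (
    f"misaligned corpus: {len(en_sentences)} src / "
    f"{len(de_sentences)} mt / {len(scores)} scores")
print(f"{len(en_sentences)} aligned (source, translation, score) triples")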
= []\n","for sentence in en_sentences:\n"," sentence_vectors = []\n"," for word in spacy_en.tokenizer(sentence.lower()):\n"," if word.text in stop_words_en or word.text in punctuation:\n"," continue\n"," sentence_vectors.append(word.vector)\n"," en_sentences_vectors.append(sentence_vectors)\n","\n","de_sentences_vectors = []\n","for sentence in de_sentences:\n"," sentence_vectors = []\n"," for word in spacy_de.tokenizer(sentence.lower()):\n"," if word.text in stop_words_de or word.text in punctuation:\n"," continue\n"," sentence_vectors.append(word.vector)\n"," de_sentences_vectors.append(sentence_vectors)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"auKS7OpTMZhk","colab_type":"code","colab":{}},"source":["with open(\"en_vectors.pk\", \"wb\") as f:\n"," pickle.dump(en_sentences_vectors, f)\n","with open(\"de_vectors.pk\", \"wb\") as f:\n"," pickle.dump(de_sentences_vectors, f)\n","with open(\"scores.pk\", \"wb\") as f:\n"," pickle.dump(scores, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"cpYwbE5z9EIr","colab_type":"code","colab":{}},"source":["# Tokenisation\n","# https://spacy.io/usage/linguistic-features#tokenization\n","# Splits the word, removes punctuation\n","\n","punctuation = list(string.punctuation)\n","\n","en_sentences_tokenized = []\n","en_set_words = set()\n","for sentence in en_sentences:\n"," sentence_tokenized = []\n"," for word in spacy_en.tokenizer(sentence.lower()):\n"," word_str = word.text\n"," if word_str in stop_words_en or word_str in punctuation:\n"," continue\n"," sentence_tokenized.append(word_str)\n"," en_set_words.add(word_str)\n"," en_sentences_tokenized.append(sentence_tokenized)\n","\n","de_sentences_tokenized = []\n","de_set_words = set()\n","for sentence in de_sentences:\n"," sentence_tokenized = []\n"," for word in spacy_de.tokenizer(sentence.lower()):\n"," word_str = word.text\n"," if word_str in stop_words_de or word_str in punctuation:\n"," continue\n"," sentence_tokenized.append(word_str)\n"," de_set_words.add(word_str)\n"," de_sentences_tokenized.append(sentence_tokenized)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"UL2LrLiM-Zkj","colab_type":"code","colab":{}},"source":["class Vocabulary(object):\n"," def __init__(self):\n"," # Mapping from tokens to integers\n"," self._word2idx = {}\n","\n"," # Reverse-mapping from integers to tokens\n"," self.idx2word = []\n","\n"," # 0-padding token\n"," self.add_word('<pad>')\n"," # Unknown words\n"," self.add_word('<unk>')\n","\n"," self._unk_idx = self._word2idx['<unk>']\n","\n"," def word2idx(self, word):\n"," return self._word2idx.get(word, self._unk_idx)\n","\n"," def add_word(self, word):\n"," if word not in self._word2idx:\n"," self.idx2word.append(word)\n"," self._word2idx[word] = len(self.idx2word) - 1\n","\n"," def build_from_set(self, words):\n"," \"\"\"Builds a vocabulary from a given corpus file.\"\"\"\n"," for word in words:\n"," self.add_word(word)\n","\n"," def convert_idxs_to_words(self, idxs):\n"," \"\"\"Converts a list of indices to words.\"\"\"\n"," return ' '.join(self.idx2word[idx] for idx in idxs)\n","\n"," def convert_words_to_idxs(self, words):\n"," \"\"\"Converts a list of words to a list of indices.\"\"\"\n"," return [self.word2idx(w) for w in words]\n","\n"," def __len__(self):\n"," \"\"\"Returns the size of the vocabulary.\"\"\"\n"," return len(self.idx2word)\n"," \n"," def __repr__(self):\n"," return \"Vocabulary with {} 
items\".format(self.__len__())"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"pHyLTcCl-21D","colab_type":"code","outputId":"f1de55db-a722-4a8f-83e0-cd6c06427fa6","executionInfo":{"status":"ok","timestamp":1582140226209,"user_tz":0,"elapsed":470,"user":{"displayName":"Andy Wang","photoUrl":"","userId":"02776860930356410397"}},"colab":{"base_uri":"https://localhost:8080/","height":68}},"source":["en_vocab = Vocabulary()\n","en_vocab.build_from_set(en_set_words)\n","de_vocab = Vocabulary()\n","de_vocab.build_from_set(de_set_words)\n","\n","unified_vocab = Vocabulary()\n","unified_vocab.build_from_set(en_set_words.union(de_set_words))\n","\n","print(f\"Vocab size (EN): {len(en_vocab)}\")\n","print(f\"Vocab size (DE): {len(de_vocab)}\")\n","print(f\"Vocab size (UNIFIED): {len(unified_vocab)}\")"],"execution_count":0,"outputs":[{"output_type":"stream","text":["25237\n","27854\n","42154\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"oly_6eKdBs0R","colab_type":"code","colab":{}},"source":["# Convert to list of indices\n","en_indices = []\n","for sentence in en_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(en_vocab.word2idx(word))\n"," en_indices.append(sentence_indices)\n","\n","de_indices = []\n","for sentence in de_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(de_vocab.word2idx(word))\n"," de_indices.append(sentence_indices)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"wKEI5zZcBxcv","colab_type":"code","colab":{}},"source":["with open(\"en_indices.pk\", \"wb\") as f:\n"," pickle.dump(en_indices, f)\n","with open(\"de_indices.pk\", \"wb\") as f:\n"," pickle.dump(de_indices, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"EmWnoi_h6M_m","colab_type":"code","colab":{}},"source":["# Convert to list of indices, using shared vocab\n","en_indices = []\n","for sentence in en_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(unified_vocab.word2idx(word))\n"," en_indices.append(sentence_indices)\n","\n","de_indices = []\n","for sentence in de_sentences_tokenized:\n"," sentence_indices = []\n"," for word in sentence:\n"," sentence_indices.append(unified_vocab.word2idx(word))\n"," de_indices.append(sentence_indices)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"PHy8DIID6Xtb","colab_type":"code","colab":{}},"source":["with open(\"en_indices_shared.pk\", \"wb\") as f:\n"," pickle.dump(en_indices, f)\n","with open(\"de_indices_shared.pk\", \"wb\") as f:\n"," pickle.dump(de_indices, f)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"GDRL7ZnZUt-l","colab_type":"code","colab":{}},"source":["# Create average embeddings for each sentence\n","en_sentences_vectors = []\n","for sentence in en_sentences:\n"," en_sentences_vectors.append(spacy_en.tokenizer(sentence.lower()).vector)\n","\n","de_sentences_vectors = []\n","for sentence in de_sentences:\n"," de_sentences_vectors.append(spacy_en.tokenizer(sentence.lower()).vector)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"OBLPz2izVMUq","colab_type":"code","colab":{}},"source":["with open(\"en_sentence_emb.pk\", \"wb\") as f:\n"," pickle.dump(en_sentences_vectors, f)\n","with open(\"de_sentence_emb.pk\", \"wb\") as f:\n"," pickle.dump(de_sentences_vectors, f)"],"execution_count":0,"outputs":[]}]}