diff --git a/.DS_Store b/.DS_Store
index eaecae14706f3bb903f60c2ca7f788228d245714..ce6f4784a89a4752679177b8a1c758c5685c0b78 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb
index 352dbc41154b8a4ced14887e66fb434cb7288f7b..c53b4e085e9110d720df491fb0b49741e84eb894 100644
--- a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb
+++ b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb
@@ -666,10 +666,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "wordEmbeddingModel = naw.ContextualWordEmbsAug(\n",
-    "    model_path='bert-base-uncased', action=\"substitute\", \n",
-    "    aug_p=0.2, device='cuda',\n",
-    ")"
+    "import os\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
    ]
   },
   {
@@ -678,14 +676,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "trdf1_word_embedding_subst = trdf1.copy()\n",
-    "pat_sent = trdf1.loc[trdf1['label'] == 1]\n",
-    "for i in range(1):\n",
-    "    print(f\"Epoch {i}\")\n",
-    "    # to potentially break this down into batches -> very slow! 1 run takes 17m smh\n",
-    "    pat_sent_synonym = pat_sent.copy()\n",
-    "    pat_sent_synonym['text'] = pat_sent_synonym['text'].apply(lambda x: wordEmbeddingModel.augment(x))\n",
-    "    trdf1_word_embedding_subst = pd.concat([trdf1_word_embedding_subst, pat_sent_synonym], ignore_index=True)\n"
+    "wordEmbeddingModel = naw.ContextualWordEmbsAug(\n",
+    "    model_path='bert-base-uncased', action=\"substitute\", \n",
+    "    aug_p=0.2, device='cuda',\n",
+    ")"
    ]
   },
   {
@@ -694,7 +688,58 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "trdf1_word_embedding_subst.value_counts()"
+    "trdf1_copy = trdf1.copy()\n",
+    "pat_sent = trdf1.loc[trdf1['label'] == 1]\n",
+    "\n",
+    "def create_dataset(extended_dataset):\n",
+    "    df = pd.DataFrame(extended_dataset, columns=[\"text\"])\n",
+    "    df[\"label\"] = 1\n",
+    "    return df\n",
+    "\n",
+    "def extend_dataset_with_embedding_substitution(pat_sent, percentage_of_words):\n",
+    "    bs = 64  # augment in batches; row-by-row augmentation took ~17 min per pass\n",
+    "    total_len = len(pat_sent)\n",
+    "    current_len = 0\n",
+    "\n",
+    "    wordEmbeddingModel = naw.ContextualWordEmbsAug(\n",
+    "        model_path='bert-base-uncased', action=\"substitute\",\n",
+    "        aug_p=percentage_of_words / 100, device='cuda',  # aug_p expects a fraction, not a percentage\n",
+    "    )\n",
+    "\n",
+    "    altered_dataset = []\n",
+    "\n",
+    "    while current_len < total_len:\n",
+    "        print(f\"Batch {current_len}\")\n",
+    "        curr_batch_start = current_len\n",
+    "        next_batch_start = min(current_len + bs, total_len)\n",
+    "        curr_batch = pat_sent[curr_batch_start:next_batch_start].to_list()\n",
+    "        current_len = next_batch_start\n",
+    "        pat_sent_subst = wordEmbeddingModel.augment(curr_batch)\n",
+    "        altered_dataset.append(pat_sent_subst)\n",
+    "\n",
+    "    return np.concatenate(altered_dataset)\n",
+    "\n",
+    "\n",
+    "def apply_embedding(data, percentage_of_words=20, percentage_of_data=20):\n",
+    "    non_pat_sent = data.loc[data['label'] == 0].copy()\n",
+    "    pat_sent = data.loc[data['label'] == 1].copy()\n",
+    "\n",
+    "    non_num_of_data_augmenting = int(percentage_of_data / 100 * len(non_pat_sent))\n",
+    "    pat_num_of_data_augmenting = int(percentage_of_data / 100 * len(pat_sent))\n",
+    "\n",
+    "    # extract percentage_of_data into non_pat_sent and pat_sent\n",
+    "    # non_pat_sent = non_pat_sent[:non_num_of_data_augmenting]\n",
+    "    pat_sent = pat_sent[:pat_num_of_data_augmenting]\n",
+    "\n",
+    "    additional_augmented_pat_sent = extend_dataset_with_embedding_substitution(pat_sent['text'], percentage_of_words)\n",
+    "\n",
+    "    augmented_pat_sent_df = create_dataset(additional_augmented_pat_sent)\n",
+    "\n",
+    "    len_non_pat_sent = len(pat_sent) * 2  # two negatives per augmented positive\n",
+    "\n",
+    "    additional_non_pat_sent_data = non_pat_sent[1270: 1270 + len_non_pat_sent]  # hard-coded offset into the negative rows\n",
+    "\n",
+    "    return augmented_pat_sent_df, additional_non_pat_sent_data"
    ]
   },
   {
@@ -764,6 +809,17 @@
    "# RoBERTa Baseline for Task 1"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop irrelevant columns\n",
+    "trdf1 = trdf1.drop(\"par_id\", axis=1)\n",
+    "tedf1 = tedf1.drop(\"par_id\", axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -857,17 +913,22 @@
    "    \"adafactor_warmup_init\": False\n",
    "}\n",
    "\n",
-    "def preprocess(data, use_synonyms=False, use_embedding=False, use_translate=False):\n",
+    "def preprocess(data, use_synonyms=False, use_embedding=False, use_translate=False, word_percent=20, data_percent=20):\n",
+    "    new_df = data\n",
    "    if use_synonyms:\n",
-    "        data = apply_synonyms(data)\n",
+    "        augmented = apply_synonyms(data)\n",
+    "        new_df = pd.concat([new_df, augmented], ignore_index=True)\n",
    "    if use_embedding:\n",
-    "        data = apply_embedding(data)\n",
+    "        augmented_data, non_pat_data = apply_embedding(data, word_percent, data_percent)\n",
+    "        new_df = pd.concat([new_df, augmented_data, non_pat_data], ignore_index=True)\n",
    "    if use_translate:\n",
-    "        data = apply_translate(data)\n",
-    "    return data\n",
+    "        augmented = apply_translate(data)\n",
+    "        new_df = pd.concat([new_df, augmented], ignore_index=True)\n",
    "\n",
+    "    return new_df\n",
    "\n",
-    "def train_model(model, model_name, train, val, hyperparams, use_synonyms=False, use_embedding=False, use_translate=False):\n",
+    "\n",
+    "def train_model(model, model_name, train, val, hyperparams, use_synonyms=False, use_embedding=False, use_translate=False, word_percent=20, data_percent=20):\n",
    "    ## Normal training\n",
    "    task1_model_args = ClassificationArgs(\n",
    "        no_save=True, \n",
@@ -885,7 +946,7 @@
    "\n",
    "    \n",
    "    # train model\n",
-    "    train = preprocess(train, use_synonyms, use_embedding, use_translate)\n",
+    "    train = preprocess(train, use_synonyms, use_embedding, use_translate, word_percent, data_percent)\n",
    "    task1_model.train_model(train[['text', 'label']], eval_df=val[['text', 'label']])\n",
    "    return task1_model\n",
    "\n",
@@ -914,18 +975,6 @@
    "training_set1_lower_case['text'] = training_set1['text'].apply(lambda x: x.lower())"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train\n",
-    "task1_model = train_model(\"roberta\", \"roberta-base\", training_set1_lower_case, val_set1, hyperparams)\n",
-    "preds_task1 = test_model(task1_model)\n",
-    "print(f\"accuracy: {calc_accuracy(preds_task1)}\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/runs/aug_synonym_sent_20_rows_20/events.out.tfevents.1646240179.nkc6srymca.119.0 b/runs/aug_synonym_sent_20_rows_20/events.out.tfevents.1646240179.nkc6srymca.119.0
new file mode 100644
index 0000000000000000000000000000000000000000..d8861f82d2a2b4f844b13ddac28beaf84caa9f3a
Binary files /dev/null and b/runs/aug_synonym_sent_20_rows_20/events.out.tfevents.1646240179.nkc6srymca.119.0 differ
diff --git a/runs/aug_synonym_sent_20_rows_40/events.out.tfevents.1646241040.nkc6srymca.119.1 b/runs/aug_synonym_sent_20_rows_40/events.out.tfevents.1646241040.nkc6srymca.119.1
new file mode 100644
index 0000000000000000000000000000000000000000..68b5d3a68678e756f48c15408ebae9dcd1a2085f
Binary files /dev/null and b/runs/aug_synonym_sent_20_rows_40/events.out.tfevents.1646241040.nkc6srymca.119.1 differ
diff --git a/runs/aug_synonym_sent_20_rows_60/events.out.tfevents.1646242030.nkc6srymca.119.2 b/runs/aug_synonym_sent_20_rows_60/events.out.tfevents.1646242030.nkc6srymca.119.2
new file mode 100644
index 0000000000000000000000000000000000000000..27858cef3501cf2b05253bfc015bfa8ede195a74
Binary files /dev/null and b/runs/aug_synonym_sent_20_rows_60/events.out.tfevents.1646242030.nkc6srymca.119.2 differ
diff --git a/runs/aug_synonym_sent_40_rows_20/events.out.tfevents.1646243152.nkc6srymca.119.3 b/runs/aug_synonym_sent_40_rows_20/events.out.tfevents.1646243152.nkc6srymca.119.3
new file mode 100644
index 0000000000000000000000000000000000000000..bccb11298fc8421f2004668a992a88bfdb3afeee
Binary files /dev/null and b/runs/aug_synonym_sent_40_rows_20/events.out.tfevents.1646243152.nkc6srymca.119.3 differ
diff --git a/runs/aug_synonym_sent_40_rows_40/events.out.tfevents.1646244018.nkc6srymca.119.4 b/runs/aug_synonym_sent_40_rows_40/events.out.tfevents.1646244018.nkc6srymca.119.4
new file mode 100644
index 0000000000000000000000000000000000000000..dbe3a7139d5bdb5983e46c28fc12dd5f48f39cf8
Binary files /dev/null and b/runs/aug_synonym_sent_40_rows_40/events.out.tfevents.1646244018.nkc6srymca.119.4 differ
diff --git a/runs/aug_synonym_sent_40_rows_60/events.out.tfevents.1646245009.nkc6srymca.119.5 b/runs/aug_synonym_sent_40_rows_60/events.out.tfevents.1646245009.nkc6srymca.119.5
new file mode 100644
index 0000000000000000000000000000000000000000..b7ac38430b7b5f6639362ac1e380b80a45dfdc79
Binary files /dev/null and b/runs/aug_synonym_sent_40_rows_60/events.out.tfevents.1646245009.nkc6srymca.119.5 differ
diff --git a/runs/aug_synonym_sent_60_rows_20/events.out.tfevents.1646246135.nkc6srymca.119.6 b/runs/aug_synonym_sent_60_rows_20/events.out.tfevents.1646246135.nkc6srymca.119.6
new file mode 100644
index 0000000000000000000000000000000000000000..3d48e77303cd0be798dea6edf2c79a8d2d4871d6
Binary files /dev/null and b/runs/aug_synonym_sent_60_rows_20/events.out.tfevents.1646246135.nkc6srymca.119.6 differ
diff --git a/runs/aug_synonym_sent_60_rows_40/events.out.tfevents.1646247001.nkc6srymca.119.7 b/runs/aug_synonym_sent_60_rows_40/events.out.tfevents.1646247001.nkc6srymca.119.7
new file mode 100644
index 0000000000000000000000000000000000000000..1a03893083b98865fbbbc097dece8f6c7f9ae25a
Binary files /dev/null and b/runs/aug_synonym_sent_60_rows_40/events.out.tfevents.1646247001.nkc6srymca.119.7 differ
diff --git a/runs/aug_synonym_sent_60_rows_60/events.out.tfevents.1646247992.nkc6srymca.119.8 b/runs/aug_synonym_sent_60_rows_60/events.out.tfevents.1646247992.nkc6srymca.119.8
new file mode 100644
index 0000000000000000000000000000000000000000..b29acd95e1a9aca996051e213c75724baa6abfd1
Binary files /dev/null and b/runs/aug_synonym_sent_60_rows_60/events.out.tfevents.1646247992.nkc6srymca.119.8 differ
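
For reference, a minimal sketch of how the updated train_model is expected to be invoked now that the old "# Train" cell is removed. It assumes the notebook's existing training_set1_lower_case, val_set1, hyperparams, test_model, and calc_accuracy; the keyword arguments follow the new signature above, but the exact percentages shown are illustrative, not the notebook's confirmed settings.

# Illustrative only: train the RoBERTa baseline with BERT embedding-substitution
# augmentation on 20% of the patronising rows, substituting 20% of the words in each.
# All names come from the diff above; nothing here is a new API.
task1_model = train_model(
    "roberta", "roberta-base",
    training_set1_lower_case, val_set1, hyperparams,
    use_embedding=True, word_percent=20, data_percent=20,
)
preds_task1 = test_model(task1_model)
print(f"accuracy: {calc_accuracy(preds_task1)}")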