diff --git a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb index 92e9af3949fa77a9252d7f03b79fba2643e2b236..049d8415b540fba32de925ee2d821546e4a130b1 100644 --- a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb +++ b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb @@ -84,7 +84,7 @@ "logging.basicConfig(level=logging.INFO)\n", "\n", "transformers_logger = logging.getLogger(\"transformers\")\n", - "transformers_logger.setLevel(logging.WARNING)\n", + "transformers_logger.setLevel(logging.ERROR)\n", "\n", "# check gpu\n", "cuda_available = torch.cuda.is_available()\n", @@ -273,10 +273,14 @@ " # select row from original dataset to retrieve `text` and binary label\n", " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", " label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]\n", + " keyword = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].keyword.values[0]\n", + " country = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].country.values[0]\n", " rows.append({\n", - " 'par_id':parid,\n", + " # 'par_id':parid,\n", " 'text':text,\n", - " 'label':label\n", + " 'labels':label,\n", + " 'keyword':keyword,\n", + " 'country':country\n", " })\n", " " ] @@ -310,6 +314,13 @@ "trdf1.shape[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Frequency of class labels (Q1.1)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -323,7 +334,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Discussion regarding Analysis of class labels**\n", + "## Analysis of class labels\n", "\n", "The dataset is a skewed dataset, with 10 times more sentences not exhibiting pcl compared to sentences exhibiting pcl." ] @@ -478,6 +489,52 @@ "plt.show()" ] },
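+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The correlation cells below also use `avg_sentence_length` and `num_sentences_in_paragraph` columns on `trdf1`. A minimal sketch of how they could be derived (naive `.`/`!`/`?` splitting; the helper name `sentence_stats` is illustrative, and this is an assumption rather than necessarily how the columns were originally built):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def sentence_stats(paragraph):\n", + "    # sketch (assumed implementation): naive sentence segmentation on ., ! and ?\n", + "    sentences = [s for s in re.split(r'[.!?]+', paragraph) if s.strip()]\n", + "    num_sentences = max(len(sentences), 1)\n", + "    avg_len = sum(len(s.split()) for s in sentences) / num_sentences\n", + "    return avg_len, num_sentences\n", + "\n", + "stats = trdf1['text'].apply(sentence_stats)\n", + "trdf1['avg_sentence_length'] = stats.apply(lambda t: t[0])\n", + "trdf1['num_sentences_in_paragraph'] = stats.apply(lambda t: t[1])" + ] + },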
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Correlation between input features and labels (Q1.2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trdf1[\"input_length\"] = paragraphs.apply(lambda x : len(x.strip().split(\" \")))\n", + "\n", + "trdf1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculating correlation for input data features\n", + "# Features include input_length, avg_sentence_length, num_sentences_in_paragraph\n", + "\n", + "def calculate_corr_score(input_feature):\n", + " return trdf1[\"labels\"].corr(trdf1[input_feature])\n", + "\n", + "corr_lbls_and_input_length = calculate_corr_score(\"input_length\")\n", + "corr_lbls_and_avg_sent_len = calculate_corr_score(\"avg_sentence_length\")\n", + "corr_num_sentences_w_lbls = calculate_corr_score(\"num_sentences_in_paragraph\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Correlation score between labels and input length: {corr_lbls_and_input_length}\")\n", + "print(f\"Correlation score between labels and avg sentence length: {corr_lbls_and_avg_sent_len}\")\n", + "print(f\"Correlation score between labels and num sentences in paragraph: {corr_num_sentences_w_lbls}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -690,9 +747,11 @@ "trdf1_copy = trdf1.copy()\n", "pat_sent = trdf1.loc[trdf1['label'] == 1]\n", "\n", - "def create_dataset(extended_dataset):\n", + "def create_dataset(extended_dataset, keyword, country):\n", " df = pd.DataFrame(extended_dataset, columns=[\"text\"])\n", " df[\"label\"] = 1\n", + " df[\"keyword\"] = keyword.tolist()\n", + " df[\"country\"] = country.tolist()\n", " return df \n", "\n", "def extend_dataset_with_embedding_substitution(pat_sent, percentage_of_words):\n", @@ -727,12 +786,12 @@ " pat_num_of_data_augmenting = int(percentage_of_data / 100 * len(pat_sent))\n", "\n", " # extract percentage_of_data into non_pat_sent and pat_sent\n", - " # non_pat_sent = non_pat_sent[:non_num_of_data_augmenting]\n", + "\n", " pat_sent = pat_sent[:pat_num_of_data_augmenting]\n", "\n", " additional_augmented_pat_sent = extend_dataset_with_embedding_substitution(pat_sent['text'], percentage_of_words)\n", "\n", - " augmented_pat_sent_df = create_dataset(additional_augmented_pat_sent)\n", + " augmented_pat_sent_df = create_dataset(additional_augmented_pat_sent, pat_sent[\"keyword\"], pat_sent[\"country\"])\n", "\n", " len_non_pat_sent = len(pat_sent) * 2\n", "\n", @@ -741,6 +800,15 @@ " return augmented_pat_sent_df, additional_non_pat_sent_data" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dpm.train_task1_df.head()" + ] + }, + { "cell_type": "markdown", "metadata": { @@ -761,14 +829,18 @@ "rows = [] # will contain par_id, label and text\n", "for idx in range(len(teids)): \n", " parid = teids.par_id[idx]\n", - " #print(parid)\n", + "\n", " # select row from original dataset\n", " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", " label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]\n", + " keyword = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].keyword.values[0]\n", + " country = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].country.values[0] \n", " 
rows.append({\n", - " 'par_id':parid,\n", + " # 'par_id':parid,\n", " 'text':text,\n", - " 'label':label\n", + " 'labels':label,\n", + " 'keyword':keyword,\n", + " 'country':country\n", " })\n", " " ] @@ -799,6 +871,15 @@ "tedf1 = pd.DataFrame(rows)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tedf1.head()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -808,17 +889,6 @@ "# RoBERTa Baseline for Task 1" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop irrelevant columns\n", - "trdf1 = trdf1.drop(\"par_id\", axis=1)\n", - "tedf1 = tedf1.drop(\"par_id\", axis=1)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -832,10 +902,10 @@ "outputs": [], "source": [ "# downsample negative instances\n", - "pcldf = trdf1[trdf1.label==1]\n", + "pcldf = trdf1[trdf1.labels==1]\n", "npos = len(pcldf)\n", "\n", - "train_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])" + "train_set1 = pd.concat([pcldf,trdf1[trdf1.labels==0][:npos*2]])" ] }, { @@ -851,39 +921,17 @@ }, "outputs": [], "source": [ - "non_pat_set1 = train_set1.loc[train_set1['label'] == 0]\n", + "non_pat_set1 = train_set1.loc[train_set1['labels'] == 0]\n", "non_split = int(0.8 * len(non_pat_set1))\n", "non_train, non_val = non_pat_set1[:non_split], non_pat_set1[non_split:]\n", "\n", - "pat_set1 = train_set1.loc[train_set1['label'] == 1]\n", + "pat_set1 = train_set1.loc[train_set1['labels'] == 1]\n", "pat_split = int(0.8 * len(pat_set1))\n", "pat_train, pat_val = pat_set1[:pat_split], pat_set1[pat_split:]\n", "\n", "training_set1, val_set1 = pd.concat([non_train, pat_train], ignore_index=True), pd.concat([non_val, pat_val], ignore_index=True)\n", - "print(training_set1['label'].value_counts())\n", - "print(val_set1['label'].value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# def precision(tp, fp):\n", - "# return tp / (tp + fp)\n", - "\n", - "# def recall(tp, fn):\n", - "# return tp / (tp + fn)\n", - "\n", - "# def f1(precision, recall):\n", - "# return 2 * precision * recall / (precision + recall)\n", - "\n", - "# p = precision(110, 50)\n", - "# r = recall(110, 50)\n", - "# f_score = f1(p, r)\n", - "\n", - "# print(f\"precision: {p}, recall: {r}, f1: {f_score}\")" + "print(training_set1['labels'].value_counts())\n", + "print(val_set1['labels'].value_counts())" ] }, { @@ -893,7 +941,7 @@ "outputs": [], "source": [ "def calc_accuracy(preds_task1):\n", - " test_labels = tedf1.label.to_list()\n", + " test_labels = tedf1.labels.to_list()\n", " correct = 0\n", " for i in range(len(preds_task1)):\n", " correct += preds_task1[i] == test_labels[i]\n", @@ -901,15 +949,19 @@ "\n", "# best hyperparams\n", "hyperparams = {\n", - " \"learning_rate\": 0.0005,\n", + " \"learning_rate\": 0.001,\n", " \"train_batch_size\": 16,\n", - " \"num_train_epochs\": 5,\n", + " \"num_train_epochs\": 10,\n", " \"optimizer\": \"Adafactor\",\n", " \"scheduler\":\"linear_schedule_with_warmup\",\n", " \"evaluate_during_training\": True,\n", - " \"evaluate_during_training_steps\": 50,\n", + " \"evaluate_during_training_steps\": 120,\n", " \"adafactor_relative_step\": False,\n", - " \"adafactor_warmup_init\": False\n", + " \"adafactor_warmup_init\": False,\n", + " \"use_early_stopping\": True,\n", + " \"early_stopping_delta\": 0.01,\n", + " \"loss_type\": \"focal\"\n", + " # \"early_stopping_consider_epochs\":True\n", "}\n", "\n", "def 
preprocess(data, use_synonyms=False, use_embedding=False, use_translate=False, word_percent=20, data_precent=20):\n", @@ -933,7 +985,7 @@ " no_save=True, \n", " no_cache=True, \n", " overwrite_output_dir=True,\n", - " logging_steps=50,\n", + " logging_steps=120,\n", " **hyperparams)\n", " task1_model = ClassificationModel(model, \n", " model_name, \n", @@ -944,21 +996,14 @@ " \n", " # train model\n", " train = preprocess(train, use_synonyms, use_embedding, use_translate, word_percent, data_percent)\n", - " task1_model.train_model(train[['text', 'label']], eval_df=val[['text', 'label']])\n", + " task1_model.train_model(train[['text', 'labels', 'keyword', 'country']], eval_df=val[['text', 'labels', 'keyword', 'country']])\n", " return task1_model\n", "\n", "def test_model(task1_model):\n", " # run predictions\n", " preds_task1, _ = task1_model.predict(tedf1.text.tolist())\n", - " return preds_task1\n", - "\n", - "# hyperparams[\"train_batch_size\"] = bs \n", - "# task1_model = train_model(\"roberta\", \"roberta-base\", training_set1, val_set1, hyperparams)\n", - "# preds_task1 = test_model(task1_model)\n", - "# # all_preds.append(preds_task1)\n", - "# del task1_model\n", - "# torch.cuda.empty_cache()\n", - "# torch.cuda.synchronize()" + " return preds_task1\n" ] }, { "cell_type": "code", "execution_count": null, @@ -972,6 +1017,112 @@ "training_set1_lower_case['text'] = training_set1['text'].apply(lambda x: x.lower())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def precision(tp, fp):\n", + " # precision = tp / (tp + fp); defined as 0 when nothing was predicted positive\n", + " try:\n", + " return tp / (tp + fp)\n", + " except ZeroDivisionError:\n", + " return 0\n", + "\n", + "def recall(tp, fn):\n", + " # recall = tp / (tp + fn); defined as 0 when there are no positive examples\n", + " try:\n", + " return tp / (tp + fn)\n", + " except ZeroDivisionError:\n", + " return 0\n", + "\n", + "def f1(precision, recall):\n", + " # harmonic mean of precision and recall; defined as 0 when either is 0\n", + " if precision == 0 or recall == 0:\n", + " return 0\n", + " return 2 * precision * recall / (precision + recall)\n", + "\n", + "def get_tp(preds_data, true_labels):\n", + " # count true positives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == true_labels[i] and true_labels[i] == 1\n", + " return count\n", + "\n", + "def get_fp(preds_data, true_labels):\n", + " # count false positives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == 1 and true_labels[i] == 0\n", + " return count\n", + "\n", + "def get_tn(preds_data, true_labels):\n", + " # count true negatives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == true_labels[i] and true_labels[i] == 0\n", + " return count\n", + "\n", + "def get_fn(preds_data, true_labels):\n", + " # count false negatives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == 0 and true_labels[i] == 1\n", + " return count\n", + "\n", + "def eval_metrics(preds_data, true_labels):\n", + " tp = get_tp(preds_data, true_labels)\n", + " fp = get_fp(preds_data, true_labels)\n", + " tn = get_tn(preds_data, true_labels)\n", + " fn = get_fn(preds_data, true_labels)\n", + "\n", + " p = precision(tp, fp)\n", + " r = recall(tp, fn)\n", + " f1_score = f1(p, r)\n", + "\n", + " return p, r, f1_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task1_model = train_model(\"roberta\", \"roberta-base\",\n", + " training_set1_lower_case, val_set1, hyperparams)\n", + "preds_task1 = test_model(task1_model)" ] },
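{ "cell_type": "markdown", "metadata": {}, "source": [ "A cross-check for the hand-rolled metrics above — a minimal sketch assuming scikit-learn is available; `eval_metrics_sklearn` is an illustrative name, and `zero_division=0` mirrors the convention of returning 0 when a denominator is empty:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_recall_fscore_support\n", "\n", "def eval_metrics_sklearn(preds_data, true_labels):\n", "    # sketch: should agree with eval_metrics() above on the same inputs\n", "    p, r, f, _ = precision_recall_fscore_support(\n", "        true_labels, preds_data, average='binary', zero_division=0)\n", "    return p, r, f" ] },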
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"accuracy: {calc_accuracy(preds_task1)}\")\n", "test_labels = tedf1.labels.to_list()\n", "eval_metrics(preds_task1, test_labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# best F1 so far: ~0.5335, with lr = 0.001\n", "# BERT cased vs. uncased makes no noticeable difference\n", "\n", "# learning rates tried: 0.001 (1e-3), 0.0005 (5e-4), 0.002 (2e-3)" ] }, { "cell_type": "code", "execution_count": null, @@ -994,7 +1145,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Original training" + "## Original training" ] }, { @@ -1038,7 +1189,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Submission" + "## Analysis Questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Q2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "labels2file([[k] for k in preds_task1], 'task1.txt')" + "preds_test, _ = task1_model.predict(tedf1.text.tolist())" ] }, { - "cell_type": "markdown", - "metadata": { - "id": "k7Cc_u5Oli7j" - }, + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# Rebuild training set (Task 2)" + "tedf1[\"input_length\"] = tedf1[\"text\"].apply(lambda x : len(x.strip().split(\" \")))\n", + "tedf1[\"input_length_bin_label\"] = tedf1[\"input_length\"].apply(lambda x : int(x / 5))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "D2WLYT7wli7k" - }, + "metadata": {}, "outputs": [], "source": [ - "rows2 = [] # will contain par_id, label and text\n", - "for idx in range(len(trids)): \n", - " parid = trids.par_id[idx]\n", - " label = trids.label[idx]\n", - " # select row from original dataset to retrieve the `text` value\n", - " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", - " rows2.append({\n", - " 'par_id':parid,\n", - " 'text':text,\n", - " 'label':label\n", - " })\n", - " " + "tedf1[\"predicted_labels\"] = preds_test" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "LFqMMb5Jli7l" - }, + "metadata": {}, "outputs": [], "source": [ - "trdf2 = pd.DataFrame(rows2)" + "tedf1.head()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "HayrC9q7mQPl", - "outputId": "db5f1bdf-c09a-4a57-f81e-612100e32b44" - }, - "outputs": [], - "source": [ - "trdf2" - ] - },
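{ "cell_type": "markdown", "metadata": {}, "source": [ "Worked example of the binning above: with `int(x / 5)`, a 12-word paragraph gets bin label `int(12 / 5) = 2`, i.e. the 10-14-word bucket. The next cell computes precision/recall/F1 within each such bucket." ] },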
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute classification metrics per input_length bin of 5 words:\n", "# group paragraphs with lengths 0-4, 5-9, 10-14, ... and\n", "# calculate precision/recall/F1 within each bin\n", "\n", "unique_bin_labels = tedf1[\"input_length_bin_label\"].unique()\n", "\n", "unique_bin_labels = sorted(unique_bin_labels)\n", "\n", "classification_metrics = []\n", "\n", "print(unique_bin_labels)\n", "\n", "for bin_label in unique_bin_labels:\n", " idx_of_bin_label = tedf1.index[tedf1[\"input_length_bin_label\"] == bin_label].tolist()\n", " inputs_in_bin = tedf1.iloc[idx_of_bin_label]\n", " pred_labels = inputs_in_bin[\"predicted_labels\"].tolist()\n", " actual_labels = inputs_in_bin[\"labels\"].tolist()\n", " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", " classification_metrics.append((bin_label, p, r, f1_score))\n", "\n", "classification_metrics_df = pd.DataFrame(classification_metrics, columns=[\"bin_label\", \"precision\", \"recall\", \"f1\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bin_labels = classification_metrics_df[\"bin_label\"]\n", "precision_scores = classification_metrics_df[\"precision\"]\n", "recall_scores = classification_metrics_df[\"recall\"]\n", "f1_scores = classification_metrics_df[\"f1\"]\n", "\n", "def plot_metric(xvals, yvals, title):\n", " plt.plot(xvals, yvals)\n", " plt.title(title)\n", " plt.show()\n", "\n", "plot_metric(bin_labels, precision_scores, \"Bin labels against Precision\")\n", "plot_metric(bin_labels, recall_scores, \"Bin labels against Recall\")\n", "plot_metric(bin_labels, f1_scores, \"Bin labels against F1\")" ] },
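{ "cell_type": "markdown", "metadata": {}, "source": [ "The same per-bin table can also be built more idiomatically — a sketch using `groupby`, assuming the `eval_metrics` helper defined earlier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: groupby equivalent of the explicit loop above\n", "per_bin = (\n", "    tedf1.groupby('input_length_bin_label')\n", "         .apply(lambda g: pd.Series(\n", "             eval_metrics(g['predicted_labels'].tolist(), g['labels'].tolist()),\n", "             index=['precision', 'recall', 'f1']))\n", "         .reset_index()\n", ")" ] },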
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Q3" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "gjH-AJK1li7m" - }, + "metadata": {}, "outputs": [], "source": [ - "rows2 = [] # will contain par_id, label and text\n", - "for idx in range(len(teids)): \n", - " parid = teids.par_id[idx]\n", - " label = teids.label[idx]\n", - " #print(parid)\n", - " # select row from original dataset to access the `text` value\n", - " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", - " rows2.append({\n", - " 'par_id':parid,\n", - " 'text':text,\n", - " 'label':label\n", - " })\n", - " " + "unique_keywords = tedf1[\"keyword\"].unique()\n", + "\n", + "unique_keywords_metrics = []\n", + "for keyword in unique_keywords:\n", + " rows_with_keyword = tedf1[tedf1[\"keyword\"] == keyword]\n", + " pred_labels = rows_with_keyword[\"predicted_labels\"].tolist()\n", + " actual_labels = rows_with_keyword[\"labels\"].tolist()\n", + " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", + " unique_keywords_metrics.append((keyword, p, r, f1_score))\n", + "\n", + "unique_keyword_metrics_df = pd.DataFrame(unique_keywords_metrics, columns=[\"keyword\", \"precision\", \"recall\", \"f1\"])\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "SRP-tn5wli7n" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2 = pd.DataFrame(rows2)" + "unique_countries = tedf1[\"country\"].unique()\n", + "\n", + "unique_country_metrics = []\n", + "for country in unique_countries:\n", + " rows_with_country = tedf1[tedf1[\"country\"] == country]\n", + " pred_labels = rows_with_country[\"predicted_labels\"].tolist()\n", + " actual_labels = rows_with_country[\"labels\"].tolist()\n", + " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", + " unique_country_metrics.append((country, p, r, f1_score))\n", + "\n", + "unique_country_metrics_df = pd.DataFrame(unique_country_metrics, columns=[\"country\", \"precision\", \"recall\", \"f1\"])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "8U2lrfJiolku", - "outputId": "6bf1181c-3e95-4913-cceb-9cc9e08b6c29" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2" + "!pip install seaborn\n", + "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "81aQFjWqpbe2" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2.label = tedf2.label.apply(literal_eval)" + "country_labels = unique_country_metrics_df[\"country\"]\n", + "country_precision_scores = unique_country_metrics_df[\"precision\"]\n", + "country_recall_scores = unique_country_metrics_df[\"recall\"]\n", + "country_f1_scores = unique_country_metrics_df[\"f1\"]\n", + "\n", + "def plot_metrics_for_categorical_labels(xlabels, yvals, title):\n", + " plt.figure(figsize=(10, 8))\n", + " x = range(len(yvals))\n", + " plt.scatter(x, yvals)\n", + " plt.xticks(x, xlabels)\n", + " plt.title(title)\n", + " plt.show()\n", + "\n", + "plot_metrics_for_categorical_labels(country_labels, country_precision_scores, \"Country labels against Precision\")\n", + "plot_metrics_for_categorical_labels(country_labels, country_recall_scores, \"Country labels against Recall\")\n", + "plot_metrics_for_categorical_labels(country_labels, country_f1_scores, \"Country labels against F1\")" ] },
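{ "cell_type": "markdown", "metadata": {}, "source": [ "seaborn is installed above but never used; one possible use — a sketch plotting the per-keyword F1 from `unique_keyword_metrics_df` as a sorted bar chart (assumes the dataframe built above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: seaborn view of the per-keyword F1 scores\n", "plt.figure(figsize=(10, 6))\n", "sns.barplot(data=unique_keyword_metrics_df.sort_values('f1'), x='keyword', y='f1')\n", "plt.xticks(rotation=45, ha='right')\n", "plt.title('F1 per keyword')\n", "plt.tight_layout()\n", "plt.show()" ] },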
aganist F1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "keyword_labels = unique_keyword_metrics_df[\"keyword\"]\n", + "keyword_precision_scores = unique_keyword_metrics_df[\"precision\"]\n", + "keyword_recall_scores = unique_keyword_metrics_df[\"recall\"]\n", + "keyword_f1_scores = unique_keyword_metrics_df[\"f1\"]\n", + "\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_precision_scores, \"Keyword labels aganist Precision\")\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_recall_scores, \"Keyword labels aganist recall\")\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_f1_scores, \"Keyword labels aganist f1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tedf1.head()" ] }, { "cell_type": "markdown", - "metadata": { - "id": "YKFiVaslbAiC" - }, + "metadata": {}, "source": [ - "# RoBERTa baseline for Task 2" + "# Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rebuild codalab test set" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "hmr5ZZf5Ik5T" - }, + "metadata": {}, + "outputs": [], + "source": [ + "test_df = pd.read_csv(\"./test_set.tsv\", delimiter=\"\\t\", header=None)\n", + "print(len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.columns = [\"par_id\", \"art_id\", \"keyword\", \"country\", \"text\"]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.to_csv(\"./test_set.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "all_negs = trdf2[trdf2.label.apply(lambda x:sum(x) == 0)]\n", - "all_pos = trdf2[trdf2.label.apply(lambda x:sum(x) > 0)]\n", + "codalab_test_df = pd.read_csv(\"./test_set.csv\") \n", + "\n", + "codalab_test_df_paras = codalab_test_df[\"text\"].to_list()\n", "\n", - "training_set2 = pd.concat([all_pos,all_negs[:round(len(all_pos)*0.5)]])" + "print(len(codalab_test_df[\"text\"]))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "zyBcJoHtJHE2", - "outputId": "983b27a6-3bec-47bc-e564-79face4b061c" - }, + "metadata": {}, "outputs": [], "source": [ - "training_set2" + "#submission to codalab\n", + "preds_blind_test_set, _ = task1_model.predict(codalab_test_df_paras)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 379, - "referenced_widgets": [ - "22e0c7f26e6e462ba7f63d02ed4cc1f0", - "4c58352e4a8a4836ad9d90b68876cadc", - "0649eb3fbea04d98afd7241a9f175dd1", - "36909c71cfeb44c9aa48c96b0724d5a0", - "d447cc0c53974205ade693d0eff0bc7b", - "5cb366b5f6384c408e0b6c8c8e18ca67", - "22363e506fe341e3a59dfea0a972affb", - "1143e9991f3e443a89605fcc72fae27b", - "c88d5b03d6d9498ab972678dedb018bf", - "d79ae039a52a43c0a20ff96b2d32607b", - "05826d440af54576a054d0d8e0b212f6", - "f1e8ea21a9e6457e8dcd35b21be75b22", - "1400f1d23b19496ab51b253b5f87298b", - "5ef5f967ffff4b9689f6f72772e1d9fd", - "e59cd0b3c86e4be28ae6442694b22f9c", - "0ac5312eac44449c8b72030f0d064383", - "d12ad49c9fc54f86affd96e975b93186", - "e98cda63a023489cae4ada214ba5bb38", - "379253e799eb4c9b92342abeffd9b4a2", - 
"448426e53b934f8d8cfc96a94f95e4b1", - "89e65dddb1884e158cb3d1937843ccab", - "9e3eb217f2c349cf8a2b255ca5a34622", - "7473c3bed5854eacbc818363f4a1ab9a", - "04ae2805168b48da9953754e0989f073", - "7de6deb7064344faa9e4104be09bc1b6", - "dd69b800241e4bb3a856ecd46968b762", - "2ea5a9a7f9a44295ad5364f47926b9d2", - "5d195a90ff5143a998c4cb9ca851811b", - "c4425a4579084d37b000192df67e3ae1", - "e4508e3bbceb40538e0f01a40da539de", - "d7a3b9eef0e347278563217103b4253c", - "a23c1b483a9e4aca8dc12da65527b103", - "dc4ab3d0b89440a38725e54852db46df", - "31e1a0a3fe2f4228887c1abc7443e8ea", - "f107664ab872463fa5dbee7bcfe1a866", - "e778f7c68ce54014a5bfb3050e72292a", - "8ecaba0474a64ea3a4a400107f08b90e", - "3bedc272418949e8bb2897bcfae0a36b", - "5d63fdd645634af381db3abc93ce2b24", - "afd3d0f6506e464c98b9ad57a24c3e9d", - "08263433f8b6426693653ab9955e5410", - "bdda742fee1148d581d8a15e8a560869", - "c4dbb10f007f483a950a95555583dc2a", - "d66f8f8d82f9499488b6ffa16193b320", - "25fdc4ac2d714e5ba99b4f297726d36c", - "5e5ac5500b974cfcbf76d2e5d70adb3d", - "e4ccd1d746ab4f1f826477bb021c34a8", - "16d4a580157446daaa4f721d28d095fa", - "ec1e8ea22a8f4272b258fa19a2c3ec83", - "fba89fa62d6a413b90f3961755794a61", - "22f9701e81034b5ea8d1bdd38ab0d191", - "95392827ab8149388dd9af86d1fb03a1", - "92b7ffe0dd1b49d1acdb2b588f72569d", - "afd5528b85874795ac1d736dde15b810", - "99a8a52b7dfb4fa994a4b09feba8fe58" - ] - }, - "id": "ECb7_mwzbFa6", - "outputId": "d433e1aa-c3c4-4a00-9b2a-c04ccdc20588" - }, + "metadata": {}, "outputs": [], "source": [ - "task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,\n", - " no_save=True, \n", - " no_cache=True, \n", - " overwrite_output_dir=True\n", - " )\n", - "task2_model = MultiLabelClassificationModel(\"roberta\", \n", - " 'roberta-base', \n", - " num_labels=7,\n", - " args = task2_model_args, \n", - " use_cuda=cuda_available)\n", - "# train model\n", - "task2_model.train_model(training_set2[['text', 'label']])\n", - "# run predictions\n", - "preds_task2, _ = task2_model.predict(tedf2.text.tolist())" + "print(len(preds_blind_test_set))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "0211sxhsbbWZ" - }, + "metadata": {}, "outputs": [], "source": [ - "labels2file(preds_task2, 'task2.txt')" + "labels2file([[k] for k in preds_blind_test_set], 'task1.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels2file([[k] for k in preds_task1], 'task1.txt')" ] }, { @@ -1356,28 +1611,20 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qCjziGtxJRif", - "outputId": "aef99217-9b4d-46f7-f3b9-c9313bc165dc" + "id": "GZDLUcYZbhYg", + "outputId": "7586017d-83f2-4665-eb44-e2264201ac30" }, "outputs": [], "source": [ - "!cat task2.txt | head -n 10" + "!zip submission.zip task1.txt" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GZDLUcYZbhYg", - "outputId": "7586017d-83f2-4665-eb44-e2264201ac30" - }, + "metadata": {}, "outputs": [], - "source": [ - "!zip submission.zip task1.txt task2.txt" - ] + "source": [] } ], "metadata": {