diff --git a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb index 92e9af3949fa77a9252d7f03b79fba2643e2b236..049d8415b540fba32de925ee2d821546e4a130b1 100644 --- a/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb +++ b/Reconstruct_and_RoBERTa_baseline_train_dev_dataset.ipynb @@ -84,7 +84,7 @@ "logging.basicConfig(level=logging.INFO)\n", "\n", "transformers_logger = logging.getLogger(\"transformers\")\n", - "transformers_logger.setLevel(logging.WARNING)\n", + "transformers_logger.setLevel(logging.ERROR)\n", "\n", "# check gpu\n", "cuda_available = torch.cuda.is_available()\n", @@ -273,10 +273,14 @@ " # select row from original dataset to retrieve `text` and binary label\n", " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", " label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]\n", + " keyword = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].keyword.values[0]\n", + " country = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].country.values[0]\n", " rows.append({\n", - " 'par_id':parid,\n", + " # 'par_id':parid,\n", " 'text':text,\n", - " 'label':label\n", + " 'labels':label,\n", + " 'keyword':keyword,\n", + " 'country':country\n", " })\n", " " ] @@ -310,6 +314,13 @@ "trdf1.shape[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Frequency of class labels (Q1.1)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -323,7 +334,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Discussion regarding Analysis of class labels**\n", + "## Analysis of class labels\n", "\n", "The dataset is a skewed dataset, with 10 times more sentences not exhibiting pcl compared to sentences exhibiting pcl." ] @@ -478,6 +489,52 @@ "plt.show()" ] },
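+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The correlation cells below also use `avg_sentence_length` and `num_sentences_in_paragraph` columns on `trdf1`. A minimal sketch of how they could be derived (naive `.`/`!`/`?` splitting; the helper name `sentence_stats` is illustrative, and this is an assumption rather than necessarily how the columns were originally built):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def sentence_stats(paragraph):\n", + "    # sketch (assumed implementation): naive sentence segmentation on ., ! and ?\n", + "    sentences = [s for s in re.split(r'[.!?]+', paragraph) if s.strip()]\n", + "    num_sentences = max(len(sentences), 1)\n", + "    avg_len = sum(len(s.split()) for s in sentences) / num_sentences\n", + "    return avg_len, num_sentences\n", + "\n", + "stats = trdf1['text'].apply(sentence_stats)\n", + "trdf1['avg_sentence_length'] = stats.apply(lambda t: t[0])\n", + "trdf1['num_sentences_in_paragraph'] = stats.apply(lambda t: t[1])" + ] + },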
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Correlation between input features and labels (Q1.2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trdf1[\"input_length\"] = paragraphs.apply(lambda x : len(x.strip().split(\" \")))\n", + "\n", + "trdf1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculating correlation for input data features\n", + "# Features include input_length, avg_sentence_length, num_sentences_in_paragraph\n", + "\n", + "def calculate_corr_score(input_feature):\n", + " return trdf1[\"labels\"].corr(trdf1[input_feature])\n", + "\n", + "corr_lbls_and_input_length = calculate_corr_score(\"input_length\")\n", + "corr_lbls_and_avg_sent_len = calculate_corr_score(\"avg_sentence_length\")\n", + "corr_num_sentences_w_lbls = calculate_corr_score(\"num_sentences_in_paragraph\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Correlation score between labels and input length: {corr_lbls_and_input_length}\")\n", + "print(f\"Correlation score between labels and avg sentence length: {corr_lbls_and_avg_sent_len}\")\n", + "print(f\"Correlation score between labels and num sentences in paragraph: {corr_num_sentences_w_lbls}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -690,9 +747,11 @@ "trdf1_copy = trdf1.copy()\n", "pat_sent = trdf1.loc[trdf1['label'] == 1]\n", "\n", - "def create_dataset(extended_dataset):\n", + "def create_dataset(extended_dataset, keyword, country):\n", " df = pd.DataFrame(extended_dataset, columns=[\"text\"])\n", " df[\"label\"] = 1\n", + " df[\"keyword\"] = keyword.tolist()\n", + " df[\"country\"] = country.tolist()\n", " return df \n", "\n", "def extend_dataset_with_embedding_substitution(pat_sent, percentage_of_words):\n", @@ -727,12 +786,12 @@ " pat_num_of_data_augmenting = int(percentage_of_data / 100 * len(pat_sent))\n", "\n", " # extract percentage_of_data into non_pat_sent and pat_sent\n", - " # non_pat_sent = non_pat_sent[:non_num_of_data_augmenting]\n", + "\n", " pat_sent = pat_sent[:pat_num_of_data_augmenting]\n", "\n", " additional_augmented_pat_sent = extend_dataset_with_embedding_substitution(pat_sent['text'], percentage_of_words)\n", "\n", - " augmented_pat_sent_df = create_dataset(additional_augmented_pat_sent)\n", + " augmented_pat_sent_df = create_dataset(additional_augmented_pat_sent, pat_sent[\"keyword\"], pat_sent[\"country\"])\n", "\n", " len_non_pat_sent = len(pat_sent) * 2\n", "\n", @@ -741,6 +800,15 @@ " return augmented_pat_sent_df, additional_non_pat_sent_data" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dpm.train_task1_df.head()" + ] + }, + { "cell_type": "markdown", "metadata": { @@ -761,14 +829,18 @@ "rows = [] # will contain par_id, label and text\n", "for idx in range(len(teids)): \n", " parid = teids.par_id[idx]\n", - " #print(parid)\n", + "\n", " # select row from original dataset\n", " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", " label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]\n", + " keyword = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].keyword.values[0]\n", + " country = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].country.values[0] \n", " 
rows.append({\n", - " 'par_id':parid,\n", + " # 'par_id':parid,\n", " 'text':text,\n", - " 'label':label\n", + " 'labels':label,\n", + " 'keyword':keyword,\n", + " 'country':country\n", " })\n", " " ] @@ -799,6 +871,15 @@ "tedf1 = pd.DataFrame(rows)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tedf1.head()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -808,17 +889,6 @@ "# RoBERTa Baseline for Task 1" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop irrelevant columns\n", - "trdf1 = trdf1.drop(\"par_id\", axis=1)\n", - "tedf1 = tedf1.drop(\"par_id\", axis=1)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -832,10 +902,10 @@ "outputs": [], "source": [ "# downsample negative instances\n", - "pcldf = trdf1[trdf1.label==1]\n", + "pcldf = trdf1[trdf1.labels==1]\n", "npos = len(pcldf)\n", "\n", - "train_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])" + "train_set1 = pd.concat([pcldf,trdf1[trdf1.labels==0][:npos*2]])" ] }, { @@ -851,39 +921,17 @@ }, "outputs": [], "source": [ - "non_pat_set1 = train_set1.loc[train_set1['label'] == 0]\n", + "non_pat_set1 = train_set1.loc[train_set1['labels'] == 0]\n", "non_split = int(0.8 * len(non_pat_set1))\n", "non_train, non_val = non_pat_set1[:non_split], non_pat_set1[non_split:]\n", "\n", - "pat_set1 = train_set1.loc[train_set1['label'] == 1]\n", + "pat_set1 = train_set1.loc[train_set1['labels'] == 1]\n", "pat_split = int(0.8 * len(pat_set1))\n", "pat_train, pat_val = pat_set1[:pat_split], pat_set1[pat_split:]\n", "\n", "training_set1, val_set1 = pd.concat([non_train, pat_train], ignore_index=True), pd.concat([non_val, pat_val], ignore_index=True)\n", - "print(training_set1['label'].value_counts())\n", - "print(val_set1['label'].value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# def precision(tp, fp):\n", - "# return tp / (tp + fp)\n", - "\n", - "# def recall(tp, fn):\n", - "# return tp / (tp + fn)\n", - "\n", - "# def f1(precision, recall):\n", - "# return 2 * precision * recall / (precision + recall)\n", - "\n", - "# p = precision(110, 50)\n", - "# r = recall(110, 50)\n", - "# f_score = f1(p, r)\n", - "\n", - "# print(f\"precision: {p}, recall: {r}, f1: {f_score}\")" + "print(training_set1['labels'].value_counts())\n", + "print(val_set1['labels'].value_counts())" ] }, { @@ -893,7 +941,7 @@ "outputs": [], "source": [ "def calc_accuracy(preds_task1):\n", - " test_labels = tedf1.label.to_list()\n", + " test_labels = tedf1.labels.to_list()\n", " correct = 0\n", " for i in range(len(preds_task1)):\n", " correct += preds_task1[i] == test_labels[i]\n", @@ -901,15 +949,19 @@ "\n", "# best hyperparams\n", "hyperparams = {\n", - " \"learning_rate\": 0.0005,\n", + " \"learning_rate\": 0.001,\n", " \"train_batch_size\": 16,\n", - " \"num_train_epochs\": 5,\n", + " \"num_train_epochs\": 10,\n", " \"optimizer\": \"Adafactor\",\n", " \"scheduler\":\"linear_schedule_with_warmup\",\n", " \"evaluate_during_training\": True,\n", - " \"evaluate_during_training_steps\": 50,\n", + " \"evaluate_during_training_steps\": 120,\n", " \"adafactor_relative_step\": False,\n", - " \"adafactor_warmup_init\": False\n", + " \"adafactor_warmup_init\": False,\n", + " \"use_early_stopping\": True,\n", + " \"early_stopping_delta\": 0.01,\n", + " \"loss_type\": \"focal\"\n", + " # \"early_stopping_consider_epochs\":True\n", "}\n", "\n", "def 
preprocess(data, use_synonyms=False, use_embedding=False, use_translate=False, word_percent=20, data_precent=20):\n", @@ -933,7 +985,7 @@ " no_save=True, \n", " no_cache=True, \n", " overwrite_output_dir=True,\n", - " logging_steps=50,\n", + " logging_steps=120,\n", " **hyperparams)\n", " task1_model = ClassificationModel(model, \n", " model_name, \n", @@ -944,21 +996,14 @@ " \n", " # train model\n", " train = preprocess(train, use_synonyms, use_embedding, use_translate, word_percent, data_percent)\n", - " task1_model.train_model(train[['text', 'label']], eval_df=val[['text', 'label']])\n", + " task1_model.train_model(train[['text', 'labels', 'keyword', 'country']], eval_df=val[['text', 'labels', 'keyword', 'country']])\n", " return task1_model\n", "\n", "def test_model(task1_model):\n", " # run predictions\n", " preds_task1, _ = task1_model.predict(tedf1.text.tolist())\n", - " return preds_task1\n", - "\n", - "# hyperparams[\"train_batch_size\"] = bs \n", - "# task1_model = train_model(\"roberta\", \"roberta-base\", training_set1, val_set1, hyperparams)\n", - "# preds_task1 = test_model(task1_model)\n", - "# # all_preds.append(preds_task1)\n", - "# del task1_model\n", - "# torch.cuda.empty_cache()\n", - "# torch.cuda.synchronize()" + " return preds_task1\n" ] }, { "cell_type": "code", "execution_count": null, @@ -972,6 +1017,112 @@ "training_set1_lower_case['text'] = training_set1['text'].apply(lambda x: x.lower())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def precision(tp, fp):\n", + " # precision = tp / (tp + fp); defined as 0 when nothing was predicted positive\n", + " try:\n", + " return tp / (tp + fp)\n", + " except ZeroDivisionError:\n", + " return 0\n", + "\n", + "def recall(tp, fn):\n", + " # recall = tp / (tp + fn); defined as 0 when there are no positive examples\n", + " try:\n", + " return tp / (tp + fn)\n", + " except ZeroDivisionError:\n", + " return 0\n", + "\n", + "def f1(precision, recall):\n", + " # harmonic mean of precision and recall; defined as 0 when either is 0\n", + " if precision == 0 or recall == 0:\n", + " return 0\n", + " return 2 * precision * recall / (precision + recall)\n", + "\n", + "def get_tp(preds_data, true_labels):\n", + " # count true positives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == true_labels[i] and true_labels[i] == 1\n", + " return count\n", + "\n", + "def get_fp(preds_data, true_labels):\n", + " # count false positives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == 1 and true_labels[i] == 0\n", + " return count\n", + "\n", + "def get_tn(preds_data, true_labels):\n", + " # count true negatives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == true_labels[i] and true_labels[i] == 0\n", + " return count\n", + "\n", + "def get_fn(preds_data, true_labels):\n", + " # count false negatives\n", + " count = 0\n", + " for i in range(len(preds_data)):\n", + " count += preds_data[i] == 0 and true_labels[i] == 1\n", + " return count\n", + "\n", + "def eval_metrics(preds_data, true_labels):\n", + " tp = get_tp(preds_data, true_labels)\n", + " fp = get_fp(preds_data, true_labels)\n", + " tn = get_tn(preds_data, true_labels)\n", + " fn = get_fn(preds_data, true_labels)\n", + "\n", + " p = precision(tp, fp)\n", + " r = recall(tp, fn)\n", + " f1_score = f1(p, r)\n", + "\n", + " return p, r, f1_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task1_model = train_model(\"roberta\", \"roberta-base\",\n", + " training_set1_lower_case, val_set1, hyperparams)\n", + "preds_task1 = test_model(task1_model)" ] },
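{ "cell_type": "markdown", "metadata": {}, "source": [ "A cross-check for the hand-rolled metrics above — a minimal sketch assuming scikit-learn is available; `eval_metrics_sklearn` is an illustrative name, and `zero_division=0` mirrors the convention of returning 0 when a denominator is empty:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_recall_fscore_support\n", "\n", "def eval_metrics_sklearn(preds_data, true_labels):\n", "    # sketch: should agree with eval_metrics() above on the same inputs\n", "    p, r, f, _ = precision_recall_fscore_support(\n", "        true_labels, preds_data, average='binary', zero_division=0)\n", "    return p, r, f" ] },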
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"accuracy: {calc_accuracy(preds_task1)}\")\n", "test_labels = tedf1.labels.to_list()\n", "eval_metrics(preds_task1, test_labels)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# best F1 so far: ~0.5335, with lr = 0.001\n", "# BERT cased vs. uncased makes no noticeable difference\n", "\n", "# learning rates tried: 0.001 (1e-3), 0.0005 (5e-4), 0.002 (2e-3)" ] }, { "cell_type": "code", "execution_count": null, @@ -994,7 +1145,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Original training" + "## Original training" ] }, { @@ -1038,7 +1189,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Submission" + "## Analysis Questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Q2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "labels2file([[k] for k in preds_task1], 'task1.txt')" + "preds_test, _ = task1_model.predict(tedf1.text.tolist())" ] }, { - "cell_type": "markdown", - "metadata": { - "id": "k7Cc_u5Oli7j" - }, + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "# Rebuild training set (Task 2)" + "tedf1[\"input_length\"] = tedf1[\"text\"].apply(lambda x : len(x.strip().split(\" \")))\n", + "tedf1[\"input_length_bin_label\"] = tedf1[\"input_length\"].apply(lambda x : int(x / 5))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "D2WLYT7wli7k" - }, + "metadata": {}, "outputs": [], "source": [ - "rows2 = [] # will contain par_id, label and text\n", - "for idx in range(len(trids)): \n", - " parid = trids.par_id[idx]\n", - " label = trids.label[idx]\n", - " # select row from original dataset to retrieve the `text` value\n", - " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", - " rows2.append({\n", - " 'par_id':parid,\n", - " 'text':text,\n", - " 'label':label\n", - " })\n", - " " + "tedf1[\"predicted_labels\"] = preds_test" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "LFqMMb5Jli7l" - }, + "metadata": {}, "outputs": [], "source": [ - "trdf2 = pd.DataFrame(rows2)" + "tedf1.head()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "HayrC9q7mQPl", - "outputId": "db5f1bdf-c09a-4a57-f81e-612100e32b44" - }, - "outputs": [], - "source": [ - "trdf2" - ] - },
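{ "cell_type": "markdown", "metadata": {}, "source": [ "Worked example of the binning above: with `int(x / 5)`, a 12-word paragraph gets bin label `int(12 / 5) = 2`, i.e. the 10-14-word bucket. The next cell computes precision/recall/F1 within each such bucket." ] },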
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compute classification metrics per input_length bin of 5 words:\n", "# group paragraphs with lengths 0-4, 5-9, 10-14, ... and\n", "# calculate precision/recall/F1 within each bin\n", "\n", "unique_bin_labels = tedf1[\"input_length_bin_label\"].unique()\n", "\n", "unique_bin_labels = sorted(unique_bin_labels)\n", "\n", "classification_metrics = []\n", "\n", "print(unique_bin_labels)\n", "\n", "for bin_label in unique_bin_labels:\n", " idx_of_bin_label = tedf1.index[tedf1[\"input_length_bin_label\"] == bin_label].tolist()\n", " inputs_in_bin = tedf1.iloc[idx_of_bin_label]\n", " pred_labels = inputs_in_bin[\"predicted_labels\"].tolist()\n", " actual_labels = inputs_in_bin[\"labels\"].tolist()\n", " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", " classification_metrics.append((bin_label, p, r, f1_score))\n", "\n", "classification_metrics_df = pd.DataFrame(classification_metrics, columns=[\"bin_label\", \"precision\", \"recall\", \"f1\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bin_labels = classification_metrics_df[\"bin_label\"]\n", "precision_scores = classification_metrics_df[\"precision\"]\n", "recall_scores = classification_metrics_df[\"recall\"]\n", "f1_scores = classification_metrics_df[\"f1\"]\n", "\n", "def plot_metric(xvals, yvals, title):\n", " plt.plot(xvals, yvals)\n", " plt.title(title)\n", " plt.show()\n", "\n", "plot_metric(bin_labels, precision_scores, \"Bin labels against Precision\")\n", "plot_metric(bin_labels, recall_scores, \"Bin labels against Recall\")\n", "plot_metric(bin_labels, f1_scores, \"Bin labels against F1\")" ] },
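{ "cell_type": "markdown", "metadata": {}, "source": [ "The same per-bin table can also be built more idiomatically — a sketch using `groupby`, assuming the `eval_metrics` helper defined earlier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: groupby equivalent of the explicit loop above\n", "per_bin = (\n", "    tedf1.groupby('input_length_bin_label')\n", "         .apply(lambda g: pd.Series(\n", "             eval_metrics(g['predicted_labels'].tolist(), g['labels'].tolist()),\n", "             index=['precision', 'recall', 'f1']))\n", "         .reset_index()\n", ")" ] },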
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Q3" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "gjH-AJK1li7m" - }, + "metadata": {}, "outputs": [], "source": [ - "rows2 = [] # will contain par_id, label and text\n", - "for idx in range(len(teids)): \n", - " parid = teids.par_id[idx]\n", - " label = teids.label[idx]\n", - " #print(parid)\n", - " # select row from original dataset to access the `text` value\n", - " text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]\n", - " rows2.append({\n", - " 'par_id':parid,\n", - " 'text':text,\n", - " 'label':label\n", - " })\n", - " " + "unique_keywords = tedf1[\"keyword\"].unique()\n", + "\n", + "unique_keywords_metrics = []\n", + "for keyword in unique_keywords:\n", + " rows_with_keyword = tedf1[tedf1[\"keyword\"] == keyword]\n", + " pred_labels = rows_with_keyword[\"predicted_labels\"].tolist()\n", + " actual_labels = rows_with_keyword[\"labels\"].tolist()\n", + " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", + " unique_keywords_metrics.append((keyword, p, r, f1_score))\n", + "\n", + "unique_keyword_metrics_df = pd.DataFrame(unique_keywords_metrics, columns=[\"keyword\", \"precision\", \"recall\", \"f1\"])\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "SRP-tn5wli7n" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2 = pd.DataFrame(rows2)" + "unique_countries = tedf1[\"country\"].unique()\n", + "\n", + "unique_country_metrics = []\n", + "for country in unique_countries:\n", + " rows_with_country = tedf1[tedf1[\"country\"] == country]\n", + " pred_labels = rows_with_country[\"predicted_labels\"].tolist()\n", + " actual_labels = rows_with_country[\"labels\"].tolist()\n", + " p, r, f1_score = eval_metrics(pred_labels, actual_labels)\n", + " unique_country_metrics.append((country, p, r, f1_score))\n", + "\n", + "unique_country_metrics_df = pd.DataFrame(unique_country_metrics, columns=[\"country\", \"precision\", \"recall\", \"f1\"])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "8U2lrfJiolku", - "outputId": "6bf1181c-3e95-4913-cceb-9cc9e08b6c29" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2" + "!pip install seaborn\n", + "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "81aQFjWqpbe2" - }, + "metadata": {}, "outputs": [], "source": [ - "tedf2.label = tedf2.label.apply(literal_eval)" + "country_labels = unique_country_metrics_df[\"country\"]\n", + "country_precision_scores = unique_country_metrics_df[\"precision\"]\n", + "country_recall_scores = unique_country_metrics_df[\"recall\"]\n", + "country_f1_scores = unique_country_metrics_df[\"f1\"]\n", + "\n", + "def plot_metrics_for_categorical_labels(xlabels, yvals, title):\n", + " plt.figure(figsize=(10, 8))\n", + " x = range(len(yvals))\n", + " plt.scatter(x, yvals)\n", + " plt.xticks(x, xlabels)\n", + " plt.title(title)\n", + " plt.show()\n", + "\n", + "plot_metrics_for_categorical_labels(country_labels, country_precision_scores, \"Country labels against Precision\")\n", + "plot_metrics_for_categorical_labels(country_labels, country_recall_scores, \"Country labels against Recall\")\n", + "plot_metrics_for_categorical_labels(country_labels, country_f1_scores, \"Country labels against F1\")" ] },
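{ "cell_type": "markdown", "metadata": {}, "source": [ "seaborn is installed above but never used; one possible use — a sketch plotting the per-keyword F1 from `unique_keyword_metrics_df` as a sorted bar chart (assumes the dataframe built above):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: seaborn view of the per-keyword F1 scores\n", "plt.figure(figsize=(10, 6))\n", "sns.barplot(data=unique_keyword_metrics_df.sort_values('f1'), x='keyword', y='f1')\n", "plt.xticks(rotation=45, ha='right')\n", "plt.title('F1 per keyword')\n", "plt.tight_layout()\n", "plt.show()" ] },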
aganist F1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "keyword_labels = unique_keyword_metrics_df[\"keyword\"]\n", + "keyword_precision_scores = unique_keyword_metrics_df[\"precision\"]\n", + "keyword_recall_scores = unique_keyword_metrics_df[\"recall\"]\n", + "keyword_f1_scores = unique_keyword_metrics_df[\"f1\"]\n", + "\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_precision_scores, \"Keyword labels aganist Precision\")\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_recall_scores, \"Keyword labels aganist recall\")\n", + "plot_metrics_for_categorical_labels(keyword_labels, keyword_f1_scores, \"Keyword labels aganist f1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tedf1.head()" ] }, { "cell_type": "markdown", - "metadata": { - "id": "YKFiVaslbAiC" - }, + "metadata": {}, "source": [ - "# RoBERTa baseline for Task 2" + "# Submission" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rebuild codalab test set" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "hmr5ZZf5Ik5T" - }, + "metadata": {}, + "outputs": [], + "source": [ + "test_df = pd.read_csv(\"./test_set.tsv\", delimiter=\"\\t\", header=None)\n", + "print(len(test_df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.columns = [\"par_id\", \"art_id\", \"keyword\", \"country\", \"text\"]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_df.to_csv(\"./test_set.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "all_negs = trdf2[trdf2.label.apply(lambda x:sum(x) == 0)]\n", - "all_pos = trdf2[trdf2.label.apply(lambda x:sum(x) > 0)]\n", + "codalab_test_df = pd.read_csv(\"./test_set.csv\") \n", + "\n", + "codalab_test_df_paras = codalab_test_df[\"text\"].to_list()\n", "\n", - "training_set2 = pd.concat([all_pos,all_negs[:round(len(all_pos)*0.5)]])" + "print(len(codalab_test_df[\"text\"]))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 422 - }, - "id": "zyBcJoHtJHE2", - "outputId": "983b27a6-3bec-47bc-e564-79face4b061c" - }, + "metadata": {}, "outputs": [], "source": [ - "training_set2" + "#submission to codalab\n", + "preds_blind_test_set, _ = task1_model.predict(codalab_test_df_paras)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 379, - "referenced_widgets": [ - "22e0c7f26e6e462ba7f63d02ed4cc1f0", - "4c58352e4a8a4836ad9d90b68876cadc", - "0649eb3fbea04d98afd7241a9f175dd1", - "36909c71cfeb44c9aa48c96b0724d5a0", - "d447cc0c53974205ade693d0eff0bc7b", - "5cb366b5f6384c408e0b6c8c8e18ca67", - "22363e506fe341e3a59dfea0a972affb", - "1143e9991f3e443a89605fcc72fae27b", - "c88d5b03d6d9498ab972678dedb018bf", - "d79ae039a52a43c0a20ff96b2d32607b", - "05826d440af54576a054d0d8e0b212f6", - "f1e8ea21a9e6457e8dcd35b21be75b22", - "1400f1d23b19496ab51b253b5f87298b", - "5ef5f967ffff4b9689f6f72772e1d9fd", - "e59cd0b3c86e4be28ae6442694b22f9c", - "0ac5312eac44449c8b72030f0d064383", - "d12ad49c9fc54f86affd96e975b93186", - "e98cda63a023489cae4ada214ba5bb38", - "379253e799eb4c9b92342abeffd9b4a2", - 
"448426e53b934f8d8cfc96a94f95e4b1", - "89e65dddb1884e158cb3d1937843ccab", - "9e3eb217f2c349cf8a2b255ca5a34622", - "7473c3bed5854eacbc818363f4a1ab9a", - "04ae2805168b48da9953754e0989f073", - "7de6deb7064344faa9e4104be09bc1b6", - "dd69b800241e4bb3a856ecd46968b762", - "2ea5a9a7f9a44295ad5364f47926b9d2", - "5d195a90ff5143a998c4cb9ca851811b", - "c4425a4579084d37b000192df67e3ae1", - "e4508e3bbceb40538e0f01a40da539de", - "d7a3b9eef0e347278563217103b4253c", - "a23c1b483a9e4aca8dc12da65527b103", - "dc4ab3d0b89440a38725e54852db46df", - "31e1a0a3fe2f4228887c1abc7443e8ea", - "f107664ab872463fa5dbee7bcfe1a866", - "e778f7c68ce54014a5bfb3050e72292a", - "8ecaba0474a64ea3a4a400107f08b90e", - "3bedc272418949e8bb2897bcfae0a36b", - "5d63fdd645634af381db3abc93ce2b24", - "afd3d0f6506e464c98b9ad57a24c3e9d", - "08263433f8b6426693653ab9955e5410", - "bdda742fee1148d581d8a15e8a560869", - "c4dbb10f007f483a950a95555583dc2a", - "d66f8f8d82f9499488b6ffa16193b320", - "25fdc4ac2d714e5ba99b4f297726d36c", - "5e5ac5500b974cfcbf76d2e5d70adb3d", - "e4ccd1d746ab4f1f826477bb021c34a8", - "16d4a580157446daaa4f721d28d095fa", - "ec1e8ea22a8f4272b258fa19a2c3ec83", - "fba89fa62d6a413b90f3961755794a61", - "22f9701e81034b5ea8d1bdd38ab0d191", - "95392827ab8149388dd9af86d1fb03a1", - "92b7ffe0dd1b49d1acdb2b588f72569d", - "afd5528b85874795ac1d736dde15b810", - "99a8a52b7dfb4fa994a4b09feba8fe58" - ] - }, - "id": "ECb7_mwzbFa6", - "outputId": "d433e1aa-c3c4-4a00-9b2a-c04ccdc20588" - }, + "metadata": {}, "outputs": [], "source": [ - "task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,\n", - " no_save=True, \n", - " no_cache=True, \n", - " overwrite_output_dir=True\n", - " )\n", - "task2_model = MultiLabelClassificationModel(\"roberta\", \n", - " 'roberta-base', \n", - " num_labels=7,\n", - " args = task2_model_args, \n", - " use_cuda=cuda_available)\n", - "# train model\n", - "task2_model.train_model(training_set2[['text', 'label']])\n", - "# run predictions\n", - "preds_task2, _ = task2_model.predict(tedf2.text.tolist())" + "print(len(preds_blind_test_set))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "0211sxhsbbWZ" - }, + "metadata": {}, "outputs": [], "source": [ - "labels2file(preds_task2, 'task2.txt')" + "labels2file([[k] for k in preds_blind_test_set], 'task1.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels2file([[k] for k in preds_task1], 'task1.txt')" ] }, { @@ -1356,28 +1611,20 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qCjziGtxJRif", - "outputId": "aef99217-9b4d-46f7-f3b9-c9313bc165dc" + "id": "GZDLUcYZbhYg", + "outputId": "7586017d-83f2-4665-eb44-e2264201ac30" }, "outputs": [], "source": [ - "!cat task2.txt | head -n 10" + "!zip submission.zip task1.txt" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GZDLUcYZbhYg", - "outputId": "7586017d-83f2-4665-eb44-e2264201ac30" - }, + "metadata": {}, "outputs": [], - "source": [ - "!zip submission.zip task1.txt task2.txt" - ] + "source": [] } ], "metadata": {