Commit 12c3a07b authored by Ubuntu

added gaussian processes regression

parent da7a088e
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
......@@ -22,13 +22,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import utils.utils as utils\n",
"\n",
"data_dir = \"data/embeddings/new_bert/\"\n",
"data_dir = \"data/embeddings/bert/\"\n",
"train_en_sentence_embs = utils.load_sentence_emb(data_dir + \"train.en.json\")\n",
"train_de_sentence_embs = utils.load_sentence_emb(data_dir + \"train.de.json\")\n",
"\n",
......@@ -41,7 +41,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
......@@ -358,7 +358,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
......@@ -422,32 +422,167 @@
"metadata": {},
"outputs": [],
"source": [
"#Gaussian process experimentation\n",
"data_dir = \"data/embeddings/bert/\"\n",
"train_bert_en = utils.load_sentence_emb(data_dir + \"train.en.json\")\n",
"train_bert_de = utils.load_sentence_emb(data_dir + \"train.de.json\")\n",
"\n",
"val_bert_en = utils.load_sentence_emb(data_dir + \"dev.en.json\")\n",
"val_bert_de = utils.load_sentence_emb(data_dir + \"dev.de.json\")\n",
"\n",
"load_dir = \"data/source_ende\"\n",
"val_scores = utils.load_scores(load_dir, prefix=\"dev\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gaussian process regression with an RBF kernel on concatenated EN+DE BERT embeddings.\n",
"# Needs train_bert_*, val_bert_* and val_scores from the data-loading cell above;\n",
"# np, pearsonr, rmse and train_scores are assumed to be defined by earlier cells (TODO confirm).\n",
"from sklearn.gaussian_process import GaussianProcessRegressor\n",
"from sklearn.gaussian_process.kernels import RBF\n",
"import sklearn.gaussian_process as gp\n",
"\n",
"X_train = np.concatenate((train_bert_en, train_bert_de), axis=1)\n",
"y_train = np.array(train_scores).astype(float)\n",
"\n",
"X_val = np.concatenate((val_bert_en, val_bert_de), axis=1)\n",
"y_val = np.array(val_scores).astype(float)\n",
"\n",
"print(X_train.shape)\n",
"\n",
"# RBF kernel: initial length scale 10.0, optimised within the bounds (1e-3, 1e3).\n",
"kernel = RBF(10.0, (1e-3, 1e3))\n",
"\n",
"model = GaussianProcessRegressor(kernel=kernel)\n",
"model.fit(X_train, y_train)\n",
"# kernel_ is the kernel after fitting, so these are the optimised hyperparameters.\n",
"params = model.kernel_.get_params()\n",
"\n",
"print(\"Best Parameters for {}\".format(kernel))\n",
"print(params)\n",
"\n",
"y_pred = model.predict(X_val)\n",
"pearson = pearsonr(y_val, y_pred)\n",
"\n",
"print('RMSE: {} Pearson {}'.format(rmse(y_pred, y_val), pearson[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gaussian process regression with a RationalQuadratic kernel on concatenated EN+DE BERT embeddings.\n",
"# Needs train_bert_*, val_bert_* and val_scores from the data-loading cell above;\n",
"# np, pearsonr, rmse and train_scores are assumed to be defined by earlier cells (TODO confirm).\n",
"from sklearn.gaussian_process import GaussianProcessRegressor\n",
"# Wildcard import brings RationalQuadratic (and the other kernel classes) into scope.\n",
"from sklearn.gaussian_process.kernels import *\n",
"import sklearn.gaussian_process as gp\n",
"\n",
"X_train = np.concatenate((train_bert_en, train_bert_de), axis=1)\n",
"y_train = np.array(train_scores).astype(float)\n",
"\n",
"X_val = np.concatenate((val_bert_en, val_bert_de), axis=1)\n",
"y_val = np.array(val_scores).astype(float)\n",
"\n",
"print(X_train.shape)\n",
"\n",
"# Default-constructed kernel; its hyperparameters are tuned during fit().\n",
"kernel = RationalQuadratic()\n",
"\n",
"model = GaussianProcessRegressor(kernel=kernel)\n",
"model.fit(X_train, y_train)\n",
"# kernel_ is the kernel after fitting, so these are the optimised hyperparameters.\n",
"params = model.kernel_.get_params()\n",
"\n",
"print(\"Best Parameters for {}\".format(kernel))\n",
"print(params)\n",
"\n",
"y_pred = model.predict(X_val)\n",
"pearson = pearsonr(y_val, y_pred)\n",
"\n",
"print('RMSE: {} Pearson {}'.format(rmse(y_pred, y_val), pearson[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gaussian process regression with a Matern kernel on concatenated EN+DE BERT embeddings.\n",
"# Needs train_bert_*, val_bert_* and val_scores from the data-loading cell above;\n",
"# np, pearsonr, rmse and train_scores are assumed to be defined by earlier cells (TODO confirm).\n",
"from sklearn.gaussian_process import GaussianProcessRegressor\n",
"# Import the kernel this cell actually uses, so it does not depend on a wildcard\n",
"# import from another cell having been executed first.\n",
"from sklearn.gaussian_process.kernels import Matern\n",
"import sklearn.gaussian_process as gp\n",
"\n",
"X_train = np.concatenate((train_bert_en, train_bert_de), axis=1)\n",
"y_train = np.array(train_scores).astype(float)\n",
"\n",
"X_val = np.concatenate((val_bert_en, val_bert_de), axis=1)\n",
"y_val = np.array(val_scores).astype(float)\n",
"\n",
"print(X_train.shape)\n",
"\n",
"# Default-constructed kernel; its hyperparameters are tuned during fit().\n",
"kernel = Matern()\n",
"\n",
"model = GaussianProcessRegressor(kernel=kernel)\n",
"model.fit(X_train, y_train)\n",
"# kernel_ is the kernel after fitting, so these are the optimised hyperparameters.\n",
"params = model.kernel_.get_params()\n",
"\n",
"print(\"Best Parameters for {}\".format(kernel))\n",
"print(params)\n",
"\n",
"y_pred = model.predict(X_val)\n",
"pearson = pearsonr(y_val, y_pred)\n",
"\n",
"print('RMSE: {} Pearson {}'.format(rmse(y_pred, y_val), pearson[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gaussian process regression with a DotProduct kernel on concatenated EN+DE BERT embeddings.\n",
"# Needs train_bert_*, val_bert_* and val_scores from the data-loading cell above;\n",
"# np, pearsonr, rmse and train_scores are assumed to be defined by earlier cells (TODO confirm).\n",
"from sklearn.gaussian_process import GaussianProcessRegressor\n",
"# Explicit import instead of a wildcard, so it is clear where the kernel class comes from.\n",
"from sklearn.gaussian_process.kernels import DotProduct\n",
"import sklearn.gaussian_process as gp\n",
"\n",
"X_train = np.concatenate((train_bert_en, train_bert_de), axis=1)\n",
"y_train = np.array(train_scores).astype(float)\n",
"\n",
"X_val = np.concatenate((val_bert_en, val_bert_de), axis=1)\n",
"y_val = np.array(val_scores).astype(float)\n",
"\n",
"print(X_train.shape)\n",
"\n",
"# Default-constructed kernel; its hyperparameters are tuned during fit().\n",
"kernel = DotProduct()\n",
"\n",
"model = GaussianProcessRegressor(kernel=kernel)\n",
"model.fit(X_train, y_train)\n",
"# kernel_ is the kernel after fitting, so these are the optimised hyperparameters.\n",
"params = model.kernel_.get_params()\n",
"\n",
"print(\"Best Parameters for {}\".format(kernel))\n",
"print(params)\n",
"\n",
"y_pred = model.predict(X_val)\n",
"pearson = pearsonr(y_val, y_pred)\n",
"\n",
"print('RMSE: {} Pearson {}'.format(rmse(y_pred, y_val), pearson[0]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
......@@ -461,7 +596,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.5.6"
}
},
"nbformat": 4,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment