From e7095c93cec15fb53f78c587f4fdec0fef3e8fb7 Mon Sep 17 00:00:00 2001
From: Winston <43570913+gumdropsteve@users.noreply.github.com>
Date: Fri, 2 Aug 2019 03:58:44 -0700
Subject: [PATCH] Add mortgage data download

Added mortgage data download from rapidsai GitHub (notebooks-extended/data); rerun start to finish for updated results with downloaded data instead of random data. MSE was severely lowered for both sklearn and cuML.

Aesthetic adjustments include: merge Helper Function defining to 1 cell from 2; break down paragraphs into easy to follow bullet points
---
 colab_notebooks/cuml/sgd_demo.ipynb | 958 ++++++++++++++--------------
 1 file changed, 493 insertions(+), 465 deletions(-)

diff --git a/colab_notebooks/cuml/sgd_demo.ipynb b/colab_notebooks/cuml/sgd_demo.ipynb
index 23406789..552fa7ad 100644
--- a/colab_notebooks/cuml/sgd_demo.ipynb
+++ b/colab_notebooks/cuml/sgd_demo.ipynb
@@ -1,486 +1,514 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "hGL2yxA7Cw41"
-   },
-   "source": [
-    "# Stochastic gradient descent (SGD) \n",
-    "SGD is an  incremental gradient descent algorithm which modifies its weights, in an effort to reach a local minimum. \n",
-    "\n",
-    "The cuML implementation takes only numpy arrays and cuDF datasets as inputs. \n",
-    "\n",
-    "- In order to convert your dataset into a cuDF dataframe format please refer the [cuDF documentation](https://rapidsai.github.io/projects/cudf/en/latest/)   \n",
-    "\n",
-    "The SGD algorithm implemented in cuML can accept the following parameters:\n",
-    "1. `loss` : 'hinge', 'log', 'squared_loss' (default = 'squared_loss')\n",
-    "2. `penalty`: 'none', 'l1', 'l2', 'elasticnet' (default = 'none')\n",
-    "3. `alpha`: float (default = 0.0001)\n",
-    "4. `fit_intercept` : boolean (default = True)\n",
-    "5. `epochs` : int (default = 1000)\n",
-    "6. `tol` : float (default = 1e-3)\n",
-    "7. `shuffle` : boolean (default = True)\n",
-    "8. `eta0` : float (default = 0.0)\n",
-    "9. `power_t` : float (default = 0.5)\n",
-    "10. `learning_rate` : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant')\n",
-    "11. `n_iter_no_change` : int (default = 5)\n",
-    "\n",
-    "For additional information on the SGD model please refer to the [cuML documentation](https://rapidsai.github.io/projects/cuml/en/latest/index.html)\n",
-    "- this setup may take a few minutes\n",
-    "- long output (output display removed)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "aVB1xbBuDBeY"
-   },
-   "outputs": [],
-   "source": [
-    "!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh\n",
-    "!bash rapids-colab.sh\n",
-    "\n",
-    "import sys, os\n",
-    "\n",
-    "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
-    "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
-    "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "7wrS9WpmCw42"
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import cudf\n",
-    "from cuml.solvers import SGD as cumlSGD\n",
-    "from sklearn.linear_model import SGDRegressor"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "SHiYHcOmCw46"
-   },
-   "source": [
-    "# Helper Functions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "zNUHUf93Cw47"
-   },
-   "outputs": [],
-   "source": [
-    "# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for sgd \n",
-    "import gzip\n",
-    "# change the path of the mortgage dataset if you have saved it in a different directory\n",
-    "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):\n",
-    "    if os.path.exists(cached):\n",
-    "        print('use mortgage data')\n",
-    "\n",
-    "        with gzip.open(cached) as f:\n",
-    "            X = np.load(f)\n",
-    "        # the 4th column is 'adj_remaining_months_to_maturity'\n",
-    "        # used as the label\n",
-    "        X = X[:,[i for i in range(X.shape[1]) if i!=4]]\n",
-    "        y = X[:,4:5]\n",
-    "        rindices = np.random.randint(0,X.shape[0]-1,nrows)\n",
-    "        X = X[rindices,:ncols]\n",
-    "        y = y[rindices]\n",
-    "\n",
-    "    else:\n",
-    "        # create a random dataset\n",
-    "        print('use random data')\n",
-    "        X = np.random.rand(nrows,ncols)\n",
-    "        y = np.random.randint(0,10,size=(nrows,1))\n",
-    "    train_rows = int(nrows*0.8)\n",
-    "    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})\n",
-    "    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})\n",
-    "    df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})\n",
-    "    df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})\n",
-    "    return df_X_train, df_X_test, df_y_train, df_y_test\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "lmIpjEzBCw49"
-   },
-   "outputs": [],
-   "source": [
-    "# this function checks if the results obtained from two different methods (sklearn and cuml) are the same\n",
-    "from sklearn.metrics import mean_squared_error\n",
-    "def array_equal(a,b,threshold=2e-3,with_sign=True):\n",
-    "    a = to_nparray(a).ravel()\n",
-    "    b = to_nparray(b).ravel()\n",
-    "    if with_sign == False:\n",
-    "        a,b = np.abs(a),np.abs(b)\n",
-    "    error = mean_squared_error(a,b)\n",
-    "    res = error<threshold\n",
-    "    return res\n",
-    "\n",
-    "# the function converts a variable from ndarray or dataframe format to numpy array\n",
-    "def to_nparray(x):\n",
-    "    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):\n",
-    "        return np.array(x)\n",
-    "    elif isinstance(x,np.float64):\n",
-    "        return np.array([x])\n",
-    "    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):\n",
-    "        return x.to_pandas().values\n",
-    "    return x\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "O8-VlB1JCw5A"
-   },
-   "source": [
-    "# Run tests"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "accelerator": "GPU",
     "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 136
+      "name": "sgd_demo.ipynb",
+      "version": "0.3.2",
+      "provenance": []
     },
-    "colab_type": "code",
-    "id": "favFgpCVCw5B",
-    "outputId": "df11635f-b5fe-4ead-d6d0-1cdcba6b2ef7"
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "use random data\n",
-      "training data (838860, 399)\n",
-      "training label (838860, 1)\n",
-      "testing data (209716, 399)\n",
-      "testing label (209716, 1)\n",
-      "CPU times: user 13.7 s, sys: 3.7 s, total: 17.4 s\n",
-      "Wall time: 17.4 s\n"
-     ]
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.2"
     }
-   ],
-   "source": [
-    "%%time\n",
-    "# nrows = number of samples\n",
-    "# ncols = number of features of each sample\n",
-    "nrows = 2**20\n",
-    "ncols = 399\n",
-    "\n",
-    "# dataset is split into a ratio of 80:20, \n",
-    "# 80% is used as the training data and the remaining 20% is used as the test data\n",
-    "X_train, X_test, y_train, y_test = load_data(nrows,ncols)\n",
-    "y_train_ser = y_train['fea0']\n",
-    "print('training data',X_train.shape)\n",
-    "print('training label',y_train.shape)\n",
-    "print('testing data',X_test.shape)\n",
-    "print('testing label',y_test.shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "w_exXx_UCw5G"
-   },
-   "source": [
-    "Here we set the parameters usedby both libraries. You can change the number of iterations used by changing the `iterations` variable.  Please note that making this too high can cause the functions to take a long time to complete."
-   ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "DTT1OGzxCw5H"
-   },
-   "outputs": [],
-   "source": [
-    "#set parameters \n",
-    "learning_rate = 'adaptive'\n",
-    "datatype = np.float32\n",
-    "penalty = 'elasticnet'\n",
-    "loss = 'squared_loss'\n",
-    "iterations = 10 "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "uymvkHgWCw5K"
-   },
-   "source": [
-    "The `max_iter` parameter controls the maxixmum number of iterations the model can run for but it doesn’t guarantee that the model will definitely run for all those epochs, therefore the sklearn might run for less number of epochs than the cuML model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 105
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "hGL2yxA7Cw41"
+      },
+      "source": [
+        "# Stochastic gradient descent (SGD) \n",
+        "SGD is an  incremental gradient descent algorithm which modifies its weights, in an effort to reach a local minimum. \n",
+        "\n",
+        "The cuML implementation takes only numpy arrays and cuDF datasets as inputs. \n",
+        "\n",
+        "- In order to convert your dataset into a cuDF dataframe format please refer the [cuDF documentation](https://rapidsai.github.io/projects/cudf/en/latest/)   \n",
+        "\n",
+        "The SGD algorithm implemented in cuML can accept the following parameters:\n",
+        "1. `loss` : 'hinge', 'log', 'squared_loss' (default = 'squared_loss')\n",
+        "2. `penalty`: 'none', 'l1', 'l2', 'elasticnet' (default = 'none')\n",
+        "3. `alpha`: float (default = 0.0001)\n",
+        "4. `fit_intercept` : boolean (default = True)\n",
+        "5. `epochs` : int (default = 1000)\n",
+        "6. `tol` : float (default = 1e-3)\n",
+        "7. `shuffle` : boolean (default = True)\n",
+        "8. `eta0` : float (default = 0.0)\n",
+        "9. `power_t` : float (default = 0.5)\n",
+        "10. `learning_rate` : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant')\n",
+        "11. `n_iter_no_change` : int (default = 5)\n",
+        "\n",
+        "For additional information on the SGD model please refer to the [cuML documentation](https://rapidsai.github.io/projects/cuml/en/latest/index.html)\n",
+        "\n",
+        "### Install RAPIDS AI\n",
+        "- this setup may take a few minutes\n",
+        "- long output (output display removed)"
+      ]
     },
-    "colab_type": "code",
-    "id": "FvZdm9lWCw5L",
-    "outputId": "29f1dc84-1c65-473c-9ef3-18379135bec8"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 44 s, sys: 36.2 ms, total: 44 s\n",
-      "Wall time: 44 s\n"
-     ]
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "aVB1xbBuDBeY",
+        "colab": {}
+      },
+      "source": [
+        "!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh\n",
+        "!bash rapids-colab.sh\n",
+        "\n",
+        "import sys, os\n",
+        "\n",
+        "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
+        "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
+        "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/stochastic_gradient.py:1185: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.\n",
-      "  ConvergenceWarning)\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "# use the sklearn SGD Regressor model to fit the dataset \n",
-    "sk_sgd = SGDRegressor(learning_rate=learning_rate, eta0=0.07,\n",
-    "                       max_iter=iterations, tol=0.0, fit_intercept=True,\n",
-    "                       penalty=penalty, loss=loss)\n",
-    "sk_sgd.fit(X_train, y_train_ser)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 51
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "HiUhvosywyCN",
+        "colab_type": "text"
+      },
+      "source": [
+        "### Download Mortgage Data"
+      ]
     },
-    "colab_type": "code",
-    "id": "EX8-fUSPCw5N",
-    "outputId": "86d8f6b2-f015-4884-d653-8d7bea1a3b69"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 206 ms, sys: 5.99 ms, total: 212 ms\n",
-      "Wall time: 151 ms\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "# test the model by predicting its results for the unseen test set\n",
-    "y_sk = sk_sgd.predict(X_test)\n",
-    "\n",
-    "# calculate the Mean Squared Error for the model's predictions\n",
-    "error_sk = mean_squared_error(y_test,y_sk)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 51
+      "cell_type": "code",
+      "metadata": {
+        "id": "3c9mazwxwIsX",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# downloading data & storing in data/mortgage directory (output display removed)\n",
+        "!mkdir -p data/mortgage/ && wget  -O data/mortgage/mortgage.npy.gz https://github.com/rapidsai/notebooks-extended/blob/master/data/mortgage/mortgage.npy.gz?raw=true"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
-    "colab_type": "code",
-    "id": "baA0XjYgCw5T",
-    "outputId": "eee16801-ae24-4026-c93c-ff802489c13e"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 2.02 s, sys: 782 ms, total: 2.8 s\n",
-      "Wall time: 2.71 s\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "# convert the pandas dataframe to cuDF dataframe and series\n",
-    "X_cudf = cudf.DataFrame.from_pandas(X_train)\n",
-    "X_cudf_test = cudf.DataFrame.from_pandas(X_test)\n",
-    "y_cudf = cudf.Series(y_train_ser)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 51
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "m1CyUiiYv-Km",
+        "colab_type": "text"
+      },
+      "source": [
+        "### Imports"
+      ]
     },
-    "colab_type": "code",
-    "id": "ku8uGfhHCw5W",
-    "outputId": "1baef0ff-5993-40ab-d29f-e5a808a323c7"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 3.41 s, sys: 1.18 s, total: 4.6 s\n",
-      "Wall time: 4.59 s\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "# fit the training data on cuML's implementation of SGD\n",
-    "cu_sgd = cumlSGD(learning_rate=learning_rate, eta0=0.07, epochs=iterations, #epochs == n_iter\n",
-    "                 batch_size=512,\n",
-    "                 tol=0.0, penalty=penalty, loss=loss)\n",
-    "cu_sgd.fit(X_cudf, y_cudf)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 51
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "7wrS9WpmCw42",
+        "colab": {}
+      },
+      "source": [
+        "# gpu \n",
+        "import cudf\n",
+        "import cupy\n",
+        "from cuml.solvers import SGD as cumlSGD\n",
+        "# cpu\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "from sklearn.linear_model import SGDRegressor"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
-    "colab_type": "code",
-    "id": "lj86ouxvCw5a",
-    "outputId": "81f00b5e-15a7-42f1-977c-2c85748d7092"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 125 ms, sys: 16 ms, total: 141 ms\n",
-      "Wall time: 142 ms\n"
-     ]
-    }
-   ],
-   "source": [
-    "%%time\n",
-    "# test the model by predicting its values for the test set\n",
-    "y_pred = cu_sgd.predict(X_cudf_test)\n",
-    "y_pred = to_nparray(y_pred).ravel()\n",
-    "# calculate the Mean Squared Error for the model's predictions\n",
-    "error_cu = mean_squared_error(y_test,y_pred)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 85
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "SHiYHcOmCw46"
+      },
+      "source": [
+        "## Helper Functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "zNUHUf93Cw47",
+        "colab": {}
+      },
+      "source": [
+        "\"\"\"check if the mortgage dataset is present and then extract the data from it\n",
+        "or, if data is not here, create a random dataset for sgd\"\"\" \n",
+        "import gzip\n",
+        "# change the path of the mortgage dataset if saved in different directory\n",
+        "def load_data(nrows, ncols, cached = 'data/mortgage/mortgage.npy.gz'):\n",
+        "    # check for mortgage data\n",
+        "    if os.path.exists(cached):\n",
+        "        print(f'use mortgage data\\n')\n",
+        "\n",
+        "        with gzip.open(cached) as f:\n",
+        "            X = np.load(f)\n",
+        "        # the 4th column is 'adj_remaining_months_to_maturity'\n",
+        "        # used as the label\n",
+        "        X = X[:,[i for i in range(X.shape[1]) if i!=4]]\n",
+        "        y = X[:,4:5]\n",
+        "        rindices = np.random.randint(0, X.shape[0]-1, nrows)\n",
+        "        X = X[rindices,:ncols]\n",
+        "        y = y[rindices]\n",
+        "    # mortgage data not found\n",
+        "    else:\n",
+        "        # create a random dataset\n",
+        "        print(f'use random data\\n')\n",
+        "        X = np.random.rand(nrows,ncols)\n",
+        "        y = np.random.randint(0,10,size=(nrows,1))\n",
+        "    train_rows = int(nrows*0.8)\n",
+        "    # set train & test data\n",
+        "    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] \n",
+        "                               for i in range(X.shape[1])})\n",
+        "    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] \n",
+        "                              for i in range(X.shape[1])})\n",
+        "    df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] \n",
+        "                               for i in range(y.shape[1])})\n",
+        "    df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] \n",
+        "                              for i in range(y.shape[1])})\n",
+        "    return df_X_train, df_X_test, df_y_train, df_y_test\n",
+        "  \n",
+        "  \n",
+        "\"\"\"checks if the results obtained from \n",
+        "two different methods (sklearn and cuml) are the same\n",
+        "\"\"\"\n",
+        "from sklearn.metrics import mean_squared_error\n",
+        "def array_equal(a, b, threshold=2e-3, with_sign=True):\n",
+        "    a = to_nparray(a).ravel()\n",
+        "    b = to_nparray(b).ravel()\n",
+        "    if with_sign == False:\n",
+        "        a, b = np.abs(a), np.abs(b)\n",
+        "    error = mean_squared_error(a, b)\n",
+        "    res = error < threshold\n",
+        "    return res\n",
+        "\n",
+        "  \n",
+        "\"\"\"convert a variable from ndarray or dataframe format to numpy array\n",
+        "\"\"\"\n",
+        "def to_nparray(x):\n",
+        "    if isinstance(x, np.ndarray) or isinstance(x, pd.DataFrame):\n",
+        "        return np.array(x)\n",
+        "    elif isinstance(x, np.float64):\n",
+        "        return np.array([x])\n",
+        "    elif isinstance(x, cudf.DataFrame) or isinstance(x, cudf.Series):\n",
+        "        return x.to_pandas().values\n",
+        "    return x"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "O8-VlB1JCw5A"
+      },
+      "source": [
+        "### Prep Data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "favFgpCVCw5B",
+        "outputId": "5dff95dd-6ee7-4411-f841-68463fc654ce",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 151
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# nrows = number of samples\n",
+        "# ncols = number of features of each sample\n",
+        "nrows = 2**20\n",
+        "ncols = 399\n",
+        "\n",
+        "# dataset is split into a ratio of 80:20, \n",
+        "# 80% is used as the training data & the remaining 20% is used as the test data\n",
+        "X_train, X_test, y_train, y_test = load_data(nrows,ncols)\n",
+        "y_train_ser = y_train['fea0']\n",
+        "print('training data',X_train.shape)\n",
+        "print('training label',y_train.shape)\n",
+        "print('testing data',X_test.shape)\n",
+        "print('testing label',y_test.shape)"
+      ],
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "use mortgage data\n",
+            "\n",
+            "training data (838860, 399)\n",
+            "training label (838860, 1)\n",
+            "testing data (209716, 399)\n",
+            "testing label (209716, 1)\n",
+            "CPU times: user 22.2 s, sys: 1.97 s, total: 24.2 s\n",
+            "Wall time: 24.2 s\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "w_exXx_UCw5G"
+      },
+      "source": [
+        "### Set Parameters\n",
+        "- Here we set the parameters used by both libraries\n",
+        "- You can change the number of iterations used by changing the `iterations` variable\n",
+        "- Please note that making this too high can cause the functions to take a long time to complete"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "DTT1OGzxCw5H",
+        "colab": {}
+      },
+      "source": [
+        "learning_rate = 'adaptive'\n",
+        "datatype = np.float32\n",
+        "penalty = 'elasticnet'\n",
+        "loss = 'squared_loss'\n",
+        "iterations = 10 "
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "uymvkHgWCw5K"
+      },
+      "source": [
+        "# scikit-learn vs cuML\n",
+        "- The `max_iter` parameter controls the maxixmum number of iterations the model can run for \n",
+        "  - but it doesn’t guarantee that the model will definitely run for all those epochs, \n",
+        "    - therefore the sklearn might run for less number of epochs than the cuML model"
+      ]
     },
-    "colab_type": "code",
-    "id": "oaXYoGNgCw5c",
-    "outputId": "fb79a051-4756-477a-c6bd-c2a0a6b0224d"
-   },
-   "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "SKL MSE(y):\n",
-      "8.413918591735838\n",
-      "CUML MSE(y):\n",
-      "28.550902172461804\n"
-     ]
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "FvZdm9lWCw5L",
+        "outputId": "b73e7fdd-054f-46f4-de0a-89a153a9566c",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 104
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# use the sklearn SGD Regressor model to fit the dataset \n",
+        "sk_sgd = SGDRegressor(learning_rate=learning_rate, eta0=0.07,\n",
+        "                       max_iter=iterations, tol=0.0, fit_intercept=True,\n",
+        "                       penalty=penalty, loss=loss)\n",
+        "sk_sgd.fit(X_train, y_train_ser)\n"
+      ],
+      "execution_count": 7,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 37.7 s, sys: 1.31 s, total: 39 s\n",
+            "Wall time: 39 s\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "stream",
+          "text": [
+            "/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/stochastic_gradient.py:1185: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.\n",
+            "  ConvergenceWarning)\n"
+          ],
+          "name": "stderr"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "EX8-fUSPCw5N",
+        "outputId": "b8cd2b06-2ebf-4302-d4e6-ddb7079a60e9",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 50
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# test the model by predicting its results for the unseen test set\n",
+        "y_sk = sk_sgd.predict(X_test)\n",
+        "\n",
+        "# calculate the Mean Squared Error for the model's predictions\n",
+        "error_sk = mean_squared_error(y_test, y_sk)\n"
+      ],
+      "execution_count": 8,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 374 ms, sys: 72 ms, total: 446 ms\n",
+            "Wall time: 364 ms\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "baA0XjYgCw5T",
+        "outputId": "b6181188-26f3-469d-c2b0-ebedb1ef90ba",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 50
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# convert the pandas dataframe to cuDF dataframe and series\n",
+        "X_cudf = cudf.DataFrame.from_pandas(X_train)\n",
+        "X_cudf_test = cudf.DataFrame.from_pandas(X_test)\n",
+        "y_cudf = cudf.Series(y_train_ser)"
+      ],
+      "execution_count": 9,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 2.4 s, sys: 1.21 s, total: 3.61 s\n",
+            "Wall time: 3.84 s\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "ku8uGfhHCw5W",
+        "outputId": "a04ecc70-8c18-448d-b99c-d3bc27498e22",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 50
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# fit the training data on cuML's implementation of SGD\n",
+        "cu_sgd = cumlSGD(learning_rate=learning_rate, eta0=0.07, epochs=iterations, #epochs == n_iter\n",
+        "                 batch_size=512,\n",
+        "                 tol=0.0, penalty=penalty, loss=loss)\n",
+        "cu_sgd.fit(X_cudf, y_cudf)\n"
+      ],
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 3.9 s, sys: 1.13 s, total: 5.03 s\n",
+            "Wall time: 6.2 s\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "lj86ouxvCw5a",
+        "outputId": "a284c485-71bc-4e9d-e5a0-f85ff738b782",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 50
+        }
+      },
+      "source": [
+        "%%time\n",
+        "# test the model by predicting its values for the test set\n",
+        "y_pred = cu_sgd.predict(X_cudf_test)\n",
+        "y_pred = to_nparray(y_pred).ravel()\n",
+        "# calculate the Mean Squared Error for the model's predictions\n",
+        "error_cu = mean_squared_error(y_test, y_pred)"
+      ],
+      "execution_count": 11,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "CPU times: user 113 ms, sys: 6.01 ms, total: 119 ms\n",
+            "Wall time: 121 ms\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab_type": "code",
+        "id": "oaXYoGNgCw5c",
+        "outputId": "211d827e-80cd-4031-ea5a-1612f160d1ad",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 84
+        }
+      },
+      "source": [
+        "# print the MSE of the sklearn and cuML models to compare them\n",
+        "print(\"SKL MSE(y):\")\n",
+        "print(error_sk)\n",
+        "print(\"CUML MSE(y):\")\n",
+        "print(error_cu)"
+      ],
+      "execution_count": 12,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "SKL MSE(y):\n",
+            "1.1531636925834887e-07\n",
+            "CUML MSE(y):\n",
+            "1.0572256e-07\n"
+          ],
+          "name": "stdout"
+        }
+      ]
     }
-   ],
-   "source": [
-    "# print the MSE of the sklearn and cuML models to compare them\n",
-    "print(\"SKL MSE(y):\")\n",
-    "print(error_sk)\n",
-    "print(\"CUML MSE(y):\")\n",
-    "print(error_cu)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "mC8MvNWFCw5g"
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "name": "sgd_demo.ipynb",
-   "provenance": [],
-   "version": "0.3.2"
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
+  ]
 }