From e7095c93cec15fb53f78c587f4fdec0fef3e8fb7 Mon Sep 17 00:00:00 2001 From: Winston <43570913+gumdropsteve@users.noreply.github.com> Date: Fri, 2 Aug 2019 03:58:44 -0700 Subject: [PATCH] Add mortgage data download Added mortgage data download from rapidsai GitHub (notebooks-extended/data); rerun start to finish for updated results with downloaded data instead of random data. MSE was severely lowered for both sklearn and cuML. Aesthetic adjustments include: merge Helper Function defining to 1 cell from 2; break down paragraphs into easy to follow bullet points --- colab_notebooks/cuml/sgd_demo.ipynb | 958 ++++++++++++++-------------- 1 file changed, 493 insertions(+), 465 deletions(-) diff --git a/colab_notebooks/cuml/sgd_demo.ipynb b/colab_notebooks/cuml/sgd_demo.ipynb index 23406789..552fa7ad 100644 --- a/colab_notebooks/cuml/sgd_demo.ipynb +++ b/colab_notebooks/cuml/sgd_demo.ipynb @@ -1,486 +1,514 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hGL2yxA7Cw41" - }, - "source": [ - "# Stochastic gradient descent (SGD) \n", - "SGD is an incremental gradient descent algorithm which modifies its weights, in an effort to reach a local minimum. \n", - "\n", - "The cuML implementation takes only numpy arrays and cuDF datasets as inputs. \n", - "\n", - "- In order to convert your dataset into a cuDF dataframe format please refer the [cuDF documentation](https://rapidsai.github.io/projects/cudf/en/latest/) \n", - "\n", - "The SGD algorithm implemented in cuML can accept the following parameters:\n", - "1. `loss` : 'hinge', 'log', 'squared_loss' (default = 'squared_loss')\n", - "2. `penalty`: 'none', 'l1', 'l2', 'elasticnet' (default = 'none')\n", - "3. `alpha`: float (default = 0.0001)\n", - "4. `fit_intercept` : boolean (default = True)\n", - "5. `epochs` : int (default = 1000)\n", - "6. `tol` : float (default = 1e-3)\n", - "7. `shuffle` : boolean (default = True)\n", - "8. `eta0` : float (default = 0.0)\n", - "9. `power_t` : float (default = 0.5)\n", - "10. `learning_rate` : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant')\n", - "11. `n_iter_no_change` : int (default = 5)\n", - "\n", - "For additional information on the SGD model please refer to the [cuML documentation](https://rapidsai.github.io/projects/cuml/en/latest/index.html)\n", - "- this setup may take a few minutes\n", - "- long output (output display removed)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "aVB1xbBuDBeY" - }, - "outputs": [], - "source": [ - "!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh\n", - "!bash rapids-colab.sh\n", - "\n", - "import sys, os\n", - "\n", - "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", - "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", - "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7wrS9WpmCw42" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import cudf\n", - "from cuml.solvers import SGD as cumlSGD\n", - "from sklearn.linear_model import SGDRegressor" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "SHiYHcOmCw46" - }, - "source": [ - "# Helper Functions" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zNUHUf93Cw47" - }, - "outputs": [], - "source": [ - "# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for sgd \n", - "import gzip\n", - "# change the path of the mortgage dataset if you have saved it in a different directory\n", - "def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):\n", - " if os.path.exists(cached):\n", - " print('use mortgage data')\n", - "\n", - " with gzip.open(cached) as f:\n", - " X = np.load(f)\n", - " # the 4th column is 'adj_remaining_months_to_maturity'\n", - " # used as the label\n", - " X = X[:,[i for i in range(X.shape[1]) if i!=4]]\n", - " y = X[:,4:5]\n", - " rindices = np.random.randint(0,X.shape[0]-1,nrows)\n", - " X = X[rindices,:ncols]\n", - " y = y[rindices]\n", - "\n", - " else:\n", - " # create a random dataset\n", - " print('use random data')\n", - " X = np.random.rand(nrows,ncols)\n", - " y = np.random.randint(0,10,size=(nrows,1))\n", - " train_rows = int(nrows*0.8)\n", - " df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})\n", - " df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})\n", - " df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})\n", - " df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})\n", - " return df_X_train, df_X_test, df_y_train, df_y_test\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lmIpjEzBCw49" - }, - "outputs": [], - "source": [ - "# this function checks if the results obtained from two different methods (sklearn and cuml) are the same\n", - "from sklearn.metrics import mean_squared_error\n", - "def array_equal(a,b,threshold=2e-3,with_sign=True):\n", - " a = to_nparray(a).ravel()\n", - " b = to_nparray(b).ravel()\n", - " if with_sign == False:\n", - " a,b = np.abs(a),np.abs(b)\n", - " error = mean_squared_error(a,b)\n", - " res = error