Snowflake-Labs · sfc-gh-oazarmanesh · Aug 23, 2023
diff --git a/hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb b/hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb
diff --git a/hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb b/hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb
diff --git a/hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb b/hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb
diff --git a/hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb b/hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb
diff --git a/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb b/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb
diff --git a/hol/1_3_DEMO_full_data_exploration_transformation.ipynb b/hol/1_3_DEMO_full_data_exploration_transformation.ipynb
diff --git a/hol/2_1_DEMO_model_building_scoring.ipynb b/hol/2_1_DEMO_model_building_scoring.ipynb
diff --git a/hol/3_1_SOLUTION_additional_models_xgboost.ipynb b/hol/3_1_SOLUTION_additional_models_xgboost.ipynb
diff --git a/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb b/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "2e339e77",
    "metadata": {},
@@ -10,7 +9,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "73ac09a3",
    "metadata": {},
@@ -19,7 +17,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "a7d72538",
    "metadata": {},
@@ -37,14 +34,14 @@
     "from snowflake.snowpark.session import Session\n",
     "import snowflake.snowpark.types as T\n",
     "\n",
+    "import joblib\n",
     "from snowflake.ml.modeling.lightgbm import LGBMClassifier\n",
     "\n",
     "import json\n",
     "import pandas as pd"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "c4442706",
    "metadata": {},
@@ -64,7 +61,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "11413743",
    "metadata": {},
@@ -96,7 +92,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "24d093ad",
    "metadata": {},
@@ -116,44 +111,159 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "b36c435c",
    "metadata": {},
    "source": [
     "# SOLUTION: Train a LightGBM Model on a Snowpark DataFrame using snowpark-ml"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91221b05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This local Python-function will be registered as a Stored Procedure and runs in Snowflake\n",
+    "sk_version = \"1.2.2\"\n",
+    "snowpark_version = \"1.3.0\"\n",
+    "pandas_version = \"1.5.3\"\n",
+    "lightgbm_version = \"3.3.5\"\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "584b35e7",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "def sproc_train_lightgbm_model(session: Session, \n",
+    "                               training_table: str, \n",
+    "                               feature_cols: list,\n",
+    "                               target_col: str,\n",
+    "                               model_name: str) -> T.Variant:\n",
+    "    \n",
+    "    # WORKFLOW\n",
+    "    # 1: Load data into Pandas DataFrame\n",
+    "    # 2: Define features and Label\n",
+    "    # 3: Train the model\n",
+    "    # 4: (Optional) Return feature importance\n",
+    "    # 5: Save the model and upload to Snowflake Stage\n",
+    "    # 6: Return feature importance or success-message\n",
+    "    # Hint: Make sure return is json-compatible (e.g. via calling to_dict())\n",
+    "\n",
+    "    # 1: Load data into Pandas DataFrame\n",
+    "    local_training_data = training_table.to_pandas()\n",
+    "\n",
+    "    # 2: Define features and Label\n",
+    "    X = local_training_data[feature_cols]\n",
+    "    y = local_training_data[target_col]\n",
+    "    \n",
+    "    # 3: Train the model\n",
+    "    lgbmodel = LGBMClassifier(input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n",
+    "    lgbmodel.fit(training_table)\n",
+    "    \n",
+    "    # 4: (Optional) Return feature importance\n",
+    "    feat_importance = pd.DataFrame(lgbmodel.to_lightgbm().feature_importances_,feature_cols,columns=['FeatImportance'])\n",
+    "\n",
+    "    # 5: Save the model and upload to Snowflake Stage\n",
+    "    joblib.dump(lgbmodel, \"/tmp/\" + model_name)\n",
+    "    session.file.put(\"/tmp/\" + model_name, \"@MODEL_STAGE\", auto_compress=False, overwrite=True)\n",
+    "    \n",
+    "    # 6: Return feature importance or success-message\n",
+    "    feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))\n",
+    "\n",
+    "\n",
+    "    return feat_importance"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e38d9398",
+   "metadata": {},
+   "source": [
+    "# SOLUTION: Register your Stored Procedure to train a LightGBM Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27de1ca9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "session.sql(\"CREATE STAGE IF NOT EXISTS model_stage\").collect()\n",
+    "pd.DataFrame(session.sql(\"LIST @MODEL_STAGE\").collect())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d578865",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "session.add_packages(\"snowflake-snowpark-python==1.5.1\",\n",
+    " \"pandas==1.5.3\", \"scikit-learn==1.2.2\", \"lightgbm==3.3.5\",\n",
+    " \"xgboost==1.7.3\", \"joblib==1.2.0\", \"imbalanced-learn==0.10.1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03b45ce7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Registering the function as a Stored Procedure\n",
+    "sproc_train_lightgbm_model = session.sproc.register(func=sproc_train_lightgbm_model, \n",
+    "                                                    name='sproc_train_lgbm_model', \n",
+    "                                                    is_permanent=True, \n",
+    "                                                    replace=True, \n",
+    "                                                    stage_location='MODEL_STAGE')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "979b1b1c",
+   "metadata": {},
+   "source": [
+    "# SOLUTION: Run your Stored Procedure to train a LightGBM Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01b72e16",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "feature_cols = train_sdf.columns\n",
     "feature_cols.remove('TARGET')\n",
     "feature_cols.remove('ID')\n",
-    "target_col = 'TARGET'\n",
     "\n",
-    "lgbmodel = LGBMClassifier(input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n",
-    "lgbmodel.fit(train_sdf)"
+    "result = sproc_train_lightgbm_model(training_table=train_sdf, \n",
+    "                                    feature_cols=feature_cols, \n",
+    "                                    target_col='TARGET',\n",
+    "                                    model_name=\"train_lgbm_model\", \n",
+    "                                    session=session)\n",
+    "result"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "bedab745",
+   "id": "c1239449",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Plot feature importance\n",
-    "feat_importance = pd.DataFrame(lgbmodel.get_sklearn_object().feature_importances_,feature_cols,columns=['FeatImportance'])\n",
-    "feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))"
+    "# The model is now stored in a Snowflake stage\n",
+    "pd.DataFrame(session.sql('LIST @MODEL_STAGE').collect()) "
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "19a60347",
    "metadata": {},
@@ -162,7 +272,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "95c5ff7a",
    "metadata": {},
@@ -171,12 +280,27 @@
    ]
   },
   {
-   "attachments": {},
+   "cell_type": "markdown",
+   "id": "64569d68",
+   "metadata": {},
+   "source": [
+    "# SOLUTION: Define your UDF to Score a LightGBM Model"
+   ]
+  },
+  {
    "cell_type": "markdown",
    "id": "3f2baa9c",
    "metadata": {},
    "source": [
-    "# SOLUTION: Use the fitted LightGBM Model to score a Snowpark DataFrame"
+    "# SOLUTION: Register your UDF to Score a LightGBM Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9ecebe0",
+   "metadata": {},
+   "source": [
+    "# SOLUTION: Run your UDF to Score a LightGBM Model"
    ]
   },
   {
@@ -218,7 +342,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.8.17"
   }
  },
  "nbformat": 4,