From 224b835c09cd2f7d917305a52d326722263604f4 Mon Sep 17 00:00:00 2001 From: sfc-gh-oazarmanesh Date: Wed, 23 Aug 2023 14:20:48 -0700 Subject: [PATCH] Changes made to match updates to snowflake.ml package --- ...MO_model_building_scoring-checkpoint.ipynb | 672 ++++++++ ...additional_models_xgboost-checkpoint.ipynb | 219 +++ ...dditional_models_lightgbm-checkpoint.ipynb | 216 +++ ...rameter_tuning_gridsearch-checkpoint.ipynb | 236 +++ ...asic_data_exploration_transformation.ipynb | 749 ++++++++- ...full_data_exploration_transformation.ipynb | 1434 +++++++++++++++-- hol/2_1_DEMO_model_building_scoring.ipynb | 327 +++- ...1_SOLUTION_additional_models_xgboost.ipynb | 109 +- ..._SOLUTION_additional_models_lightgbm.ipynb | 162 +- ...EMO_hyperparameter_tuning_gridsearch.ipynb | 70 +- 10 files changed, 3901 insertions(+), 293 deletions(-) create mode 100644 hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb create mode 100644 hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb create mode 100644 hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb create mode 100644 hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb diff --git a/hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb b/hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb new file mode 100644 index 0000000..7bf0f3c --- /dev/null +++ b/hol/.ipynb_checkpoints/2_1_DEMO_model_building_scoring-checkpoint.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2e339e77", + "metadata": {}, + "source": [ + "# DEMO: Model Building and Scoring (Logistic Regression)" + ] + }, + { + "cell_type": "markdown", + "id": "73ac09a3", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "a7d72538", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2cb04fd9", + 
"metadata": {}, + "outputs": [], + "source": [ + "from snowflake.snowpark.session import Session\n", + "import snowflake.snowpark.functions as F\n", + "import snowflake.snowpark.types as T\n", + "\n", + "from snowflake.ml.modeling.linear_model import LogisticRegression\n", + "\n", + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "id": "22a0aaa1", + "metadata": {}, + "source": [ + "Get installed versions of the libraries we will use later when deploying to Snowflake" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e3b512a8", + "metadata": {}, + "outputs": [], + "source": [ + "with open('packages_version.json') as f:\n", + " packages_version = json.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "11413743", + "metadata": {}, + "source": [ + "## Create Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "381310d7", + "metadata": {}, + "outputs": [], + "source": [ + "with open('creds.json') as f:\n", + " connection_parameters = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9cc080c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], + "source": [ + "session = Session.builder.configs(connection_parameters).create()\n", + "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", + "print(f\"Current Warehouse: {session.get_current_warehouse()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2a42a5d2", + "metadata": {}, + "source": [ + "## Prepare Snowflake Stage to store Models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d264e18c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status='Stage area ML_MODELS successfully 
created.')]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "session.sql('CREATE OR REPLACE STAGE ML_MODELS').collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7c293f32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_onnamedatabase_nameschema_nameurlhas_credentialshas_encryption_keyownercommentregiontypecloudnotification_channelstorage_integrationowner_role_type
02023-08-15 10:26:49.058000-07:00ML_MODELSHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
12023-08-15 10:26:15.749000-07:00ML_PROCSHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
22023-08-14 20:24:28.304000-07:00MODEL_STAGEHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
\n", + "
" + ], + "text/plain": [ + " created_on name database_name schema_name url \\\n", + "0 2023-08-15 10:26:49.058000-07:00 ML_MODELS HOL_DB PUBLIC \n", + "1 2023-08-15 10:26:15.749000-07:00 ML_PROCS HOL_DB PUBLIC \n", + "2 2023-08-14 20:24:28.304000-07:00 MODEL_STAGE HOL_DB PUBLIC \n", + "\n", + " has_credentials has_encryption_key owner comment region type \\\n", + "0 N N ACCOUNTADMIN None INTERNAL \n", + "1 N N ACCOUNTADMIN None INTERNAL \n", + "2 N N ACCOUNTADMIN None INTERNAL \n", + "\n", + " cloud notification_channel storage_integration owner_role_type \n", + "0 None None None ROLE \n", + "1 None None None ROLE \n", + "2 None None None ROLE " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(session.sql('SHOW STAGES').collect())" + ] + }, + { + "cell_type": "markdown", + "id": "24d093ad", + "metadata": {}, + "source": [ + "# Model Building\n", + "\n", + "We will use the balanced data set we created in the last part." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "570a3776", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |29819 |\n", + "|1 |29819 |\n", + "----------------------\n", + "\n" + ] + } + ], + "source": [ + "application_record_balanced_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED')\n", + "application_record_balanced_sdf.group_by('TARGET').count().show()" + ] + }, + { + "cell_type": "markdown", + "id": "dd19116e", + "metadata": {}, + "source": [ + "We are going to use stratified sampling of the data. Using **sample_by** allows us to sample based on one column and also specify the fractions of each value in the column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d91ebf56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|1 |23868 |\n", + "|0 |23843 |\n", + "----------------------\n", + "\n" + ] + } + ], + "source": [ + "train_sdf = application_record_balanced_sdf.sample_by(\"TARGET\", {1: 0.8, 0: 0.8})\n", + "train_sdf = train_sdf.cache_result()\n", + "train_sdf.group_by('TARGET').count().show()" + ] + }, + { + "cell_type": "markdown", + "id": "ca734c35", + "metadata": {}, + "source": [ + "The test data set is the rest of the rows, by using **minus** we can exclude the rows from the train_sdf DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0297ff43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |5976 |\n", + "|1 |4850 |\n", + "----------------------\n", + "\n" + ] + } + ], + "source": [ + "test_sdf = application_record_balanced_sdf.minus(train_sdf)\n", + "test_sdf.group_by('TARGET').count().show()" + ] + }, + { + "cell_type": "markdown", + "id": "c3fe042a", + "metadata": {}, + "source": [ + "Save the training and test data into tables" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f455553a", + "metadata": {}, + "outputs": [], + "source": [ + "train_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TRAIN', mode='overwrite')\n", + "test_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TEST', mode='overwrite')\n", + "\n", + "train_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TRAIN')\n", + "test_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TEST')" + ] + }, + { + "cell_type": "markdown", + "id": "a4083f48", + "metadata": {}, + "source": [ + "Fit a LogisticRegression model 
using snowpark-ml; the fitting is pushed down to Snowflake and uses scikit-learn behind the scenes." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6983af6f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_cols = train_sdf.columns\n", + "feature_cols.remove('TARGET')\n", + "feature_cols.remove('ID')\n", + "target_col = 'TARGET'\n", + "\n", + "lm = LogisticRegression(C=0.8, solver='lbfgs',random_state=0, input_cols=feature_cols, label_cols=target_col, output_cols=['PREDICTION'])\n", + "lm.fit(train_sdf)" + ] + }, + { + "cell_type": "markdown", + "id": "b9e4c195", + "metadata": {}, + "source": [ + "Plot Feature Coefficients" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5e951c2f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "feature_coefficients = pd.DataFrame(lm.to_sklearn().coef_.T,lm.to_sklearn().feature_names_in_,columns=['Coefficient'])\n", + "feature_coefficients.sort_values('Coefficient').plot.barh(y='Coefficient', figsize=(5,15))" + ] + }, + { + "cell_type": "markdown", + "id": "8097c837", + "metadata": {}, + "source": [ + "The fitted model can be retrieved as a scikit-learn object if it is to be used outside Snowflake." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b1fbbbaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sklearn.linear_model._logistic.LogisticRegression" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skl_ml = lm.to_sklearn()\n", + "type(skl_ml)" + ] + }, + { + "cell_type": "markdown", + "id": "19a60347", + "metadata": {}, + "source": [ + "# Model Scoring" + ] + }, + { + "cell_type": "markdown", + "id": "95c5ff7a", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "acbb5608", + "metadata": {}, + "source": [ + "Using the predict method of the snowpark-ml model will push down the scoring into Snowflake; it can be applied to a Snowpark DataFrame or a Pandas DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c28783e5", + "metadata": {}, + "outputs": [], + "source": [ + "scored_snowml_sdf = lm.predict(test_sdf)\n", + "# Saving it as a table so we do not call the predict function when using the scored DataFrame\n", + "scored_snowml_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TRAIN_SCORED', mode='overwrite')" + ] + }, + { + "cell_type": "markdown", + "id": "98cac9aa", + "metadata": {}, + "source": [ + "# Model Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "e90db3e5", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"5381ced6", + "metadata": {}, + "outputs": [], + "source": [ + "scored_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TRAIN_SCORED')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5990a3f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------\n", + "|\"TARGET\" |\"CAST('0.0' AS FLOAT)\" |\"CAST('1.0' AS FLOAT)\" |\n", + "--------------------------------------------------------------\n", + "|1 |2182 |2668 |\n", + "|0 |3805 |2171 |\n", + "--------------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "# Obtaining a simple confusion matrix\n", + "scored_sdf.crosstab('TARGET','PREDICTION').show()" + ] + }, + { + "cell_type": "markdown", + "id": "34e1289f", + "metadata": {}, + "source": [ + "Calculate the Accuracy, Precision, Recall and F1 metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a21ddd7f", + "metadata": {}, + "outputs": [], + "source": [ + "def calc_metrics(snf_df):\n", + " return snf_df.group_by(['TARGET','PREDICTION']).count()\\\n", + " .with_column(\"type\", F.when((F.col(\"TARGET\") == 0) & (F.col(\"PREDICTION\") == 0), \"tn\")\\\n", + " .when((F.col(\"TARGET\") == 0) & (F.col(\"PREDICTION\") == 1), \"fp\")\\\n", + " .when((F.col(\"TARGET\") == 1) & (F.col(\"PREDICTION\") == 0), \"fn\")\\\n", + " .when((F.col(\"TARGET\") == 1) & (F.col(\"PREDICTION\") == 1), \"tp\"))\\\n", + " .select([\"TYPE\", \"COUNT\"]).pivot(\"TYPE\", ['tn', 'tp', 'fn', 'fp']).sum(\"COUNT\")\\\n", + " .with_columns([\"accuracy\", \"precision\", \"recall\"],\n", + " [((F.col(\"'tp'\") + F.col(\"'tn'\")) / (F.col(\"'tp'\") + F.col(\"'tn'\") + F.col(\"'fn'\") + F.col(\"'fp'\")))\n", + " , (F.col(\"'tp'\") / (F.col(\"'tp'\") + F.col(\"'fp'\")))\n", + " ,(F.col(\"'tp'\") / (F.col(\"'tp'\") + F.col(\"'fn'\")))])\\\n", + " .with_column(\"f1\", 
(F.lit(2)*F.col(\"precision\")*F.col(\"recall\")) / (F.col(\"precision\")+F.col(\"recall\")))\\\n", + " .select([\"ACCURACY\",\"PRECISION\", \"RECALL\",\"F1\"]).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "16087bac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------\n", + "|\"ACCURACY\" |\"PRECISION\" |\"RECALL\" |\"F1\" |\n", + "--------------------------------------------------------\n", + "|0.597912 |0.551354 |0.550103 |0.550727789577 |\n", + "--------------------------------------------------------\n", + "\n" + ] + } + ], + "source": [ + "calc_metrics(scored_sdf)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f246388b", + "metadata": {}, + "outputs": [], + "source": [ + "session.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a81b7cd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb b/hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb new file mode 100644 index 0000000..11f4bf0 --- /dev/null +++ b/hol/.ipynb_checkpoints/3_1_SOLUTION_additional_models_xgboost-checkpoint.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2e339e77", + "metadata": {}, + "source": [ + "# SOLUTION: Model Building and Scoring (XGBoost)" + ] + }, + { + "cell_type": "markdown", + "id": "73ac09a3", + "metadata": {}, + 
"source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "a7d72538", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cb04fd9", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.snowpark.session import Session\n", + "import snowflake.snowpark.types as T\n", + "\n", + "from snowflake.ml.modeling.xgboost import XGBClassifier\n", + "\n", + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "1f47f8e9", + "metadata": {}, + "source": [ + "Get installed versions of the libraries we will use later when deploying to Snowflake" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f82010d6", + "metadata": {}, + "outputs": [], + "source": [ + "with open('packages_version.json') as f:\n", + " packages_version = json.load(f)\n", + "\n", + "packages_version" + ] + }, + { + "cell_type": "markdown", + "id": "11413743", + "metadata": {}, + "source": [ + "## Create Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25a54f72", + "metadata": {}, + "outputs": [], + "source": [ + "with open('creds.json') as f:\n", + " connection_parameters = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc080c0", + "metadata": {}, + "outputs": [], + "source": [ + "session = Session.builder.configs(connection_parameters).create()\n", + "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", + "print(f\"Current Warehouse: {session.get_current_warehouse()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "24d093ad", + "metadata": {}, + "source": [ + "# Model Building" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4df6c514", + "metadata": {}, + "outputs": [], + "source": [ + "train_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TRAIN')\n", + "test_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TEST')" + ] + 
}, + { + "cell_type": "markdown", + "id": "74d762b8", + "metadata": {}, + "source": [ + "# SOLUTION: Train an XGBoost Model with ml-snowpark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0053ffc1", + "metadata": {}, + "outputs": [], + "source": [ + "feature_cols = train_sdf.columns\n", + "feature_cols.remove('TARGET')\n", + "feature_cols.remove('ID')\n", + "target_col = 'TARGET'\n", + "\n", + "xgbmodel = XGBClassifier(random_state=123, input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n", + "xgbmodel.fit(train_sdf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "794aab3b", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot feature importance\n", + "feat_importance = pd.DataFrame(xgbmodel.get_sklearn_object().feature_importances_,feature_cols,columns=['FeatImportance'])\n", + "feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))" + ] + }, + { + "cell_type": "markdown", + "id": "19a60347", + "metadata": {}, + "source": [ + "# Model Scoring" + ] + }, + { + "cell_type": "markdown", + "id": "95c5ff7a", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "5046b342", + "metadata": {}, + "source": [ + "# SOLUTION: Use the fitted XGBoost Model to score a Snowpark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fda8e99d", + "metadata": {}, + "outputs": [], + "source": [ + "# Score the data using the fitted xgbmodel\n", + "scored_sdf = xgbmodel.predict(test_sdf)\n", + "scored_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED', mode='overwrite')\n", + "session.table('CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED').show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1ffaf68", + "metadata": {}, + "outputs": [], + "source": [ + "session.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb b/hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb new file mode 100644 index 0000000..b64256c --- /dev/null +++ b/hol/.ipynb_checkpoints/3_2_SOLUTION_additional_models_lightgbm-checkpoint.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2e339e77", + "metadata": {}, + "source": [ + "# SOLUTION: Model Building and Scoring (LightGBM)" + ] + }, + { + "cell_type": "markdown", + "id": "73ac09a3", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "a7d72538", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cb04fd9", + "metadata": {}, + "outputs": [], + "source": [ + "from snowflake.snowpark.session import Session\n", + "import snowflake.snowpark.types as T\n", + "\n", + "from snowflake.ml.modeling.lightgbm import LGBMClassifier\n", + "\n", + "import json\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "c4442706", + "metadata": {}, + "source": [ + "Get installed versions of the libraries we will use later when deploying to Snowflake" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53eceb1b", + "metadata": {}, + "outputs": [], + "source": [ + "with open('packages_version.json') as f:\n", + " packages_version = json.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "11413743", + "metadata": {}, + "source": [ + "## Create Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"04bad29b", + "metadata": {}, + "outputs": [], + "source": [ + "with open('creds.json') as f:\n", + " connection_parameters = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc080c0", + "metadata": {}, + "outputs": [], + "source": [ + "session = Session.builder.configs(connection_parameters).create()\n", + "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", + "print(f\"Current Warehouse: {session.get_current_warehouse()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "24d093ad", + "metadata": {}, + "source": [ + "# Model Building" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4df6c514", + "metadata": {}, + "outputs": [], + "source": [ + "train_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TRAIN')\n", + "test_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TEST')" + ] + }, + { + "cell_type": "markdown", + "id": "b36c435c", + "metadata": {}, + "source": [ + "# SOLUTION: Train a LightGBM Model on a Snowpark DataFrame using snowpark-ml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "584b35e7", + "metadata": {}, + "outputs": [], + "source": [ + "feature_cols = train_sdf.columns\n", + "feature_cols.remove('TARGET')\n", + "feature_cols.remove('ID')\n", + "target_col = 'TARGET'\n", + "\n", + "lgbmodel = LGBMClassifier(input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n", + "lgbmodel.fit(train_sdf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bedab745", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot feature importance\n", + "feat_importance = pd.DataFrame(lgbmodel.get_sklearn_object().feature_importances_,feature_cols,columns=['FeatImportance'])\n", + "feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))" + ] + }, + { + "cell_type": "markdown", + "id": "19a60347", + "metadata": {}, + "source": [ + "# Model Scoring" + ] + }, + { + 
"cell_type": "markdown", + "id": "95c5ff7a", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "3f2baa9c", + "metadata": {}, + "source": [ + "# SOLUTION: Use the fitted LightGBM Model to score a Snowpark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a31b8cc", + "metadata": {}, + "outputs": [], + "source": [ + "scored_sdf = lgbmodel.predict(test_sdf)\n", + "scored_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED', mode='overwrite')\n", + "session.table('CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED').show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f81857e0", + "metadata": {}, + "outputs": [], + "source": [ + "session.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb b/hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb new file mode 100644 index 0000000..6cae8a0 --- /dev/null +++ b/hol/.ipynb_checkpoints/4_1_DEMO_hyperparameter_tuning_gridsearch-checkpoint.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2e339e77", + "metadata": {}, + "source": [ + "# DEMO: Hyperparameter Tuning via GridSearchCV" + ] + }, + { + "cell_type": "markdown", + "id": "73ac09a3", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "a7d72538", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2cb04fd9", + "metadata": {}, + 
"outputs": [], + "source": [ + "from snowflake.snowpark.session import Session\n", + "import snowflake.snowpark.functions as F\n", + "import snowflake.snowpark.types as T\n", + "\n", + "from snowflake.ml.modeling.xgboost import XGBClassifier\n", + "from snowflake.ml.modeling.model_selection import GridSearchCV\n", + "\n", + "import json\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "91955f46", + "metadata": {}, + "source": [ + "Get installed versions of the libraries we will use later when deploying to Snowflake" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e640693e", + "metadata": {}, + "outputs": [], + "source": [ + "with open('packages_version.json') as f:\n", + " packages_version = json.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "11413743", + "metadata": {}, + "source": [ + "## Create Snowpark Session" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "04bad29b", + "metadata": {}, + "outputs": [], + "source": [ + "with open('creds.json') as f:\n", + " connection_parameters = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9cc080c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], + "source": [ + "session = Session.builder.configs(connection_parameters).create()\n", + "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", + "print(f\"Current Warehouse: {session.get_current_warehouse()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "24d093ad", + "metadata": {}, + "source": [ + "# Model Building" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4df6c514", + "metadata": {}, + "outputs": [], + "source": [ + "train_sdf = 
session.table('CREDIT_RISK_PREPARED_BALANCED_TRAIN')\n", + "test_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED_TEST')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "442a5131", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_cols = train_sdf.columns\n", + "feature_cols.remove('TARGET')\n", + "feature_cols.remove('ID')\n", + "target_col = 'TARGET'\n", + "\n", + "grid_search = GridSearchCV(\n", + " estimator=XGBClassifier(),\n", + " param_grid={\n", + " 'n_estimators':[10,50,100],\n", + " 'max_depth':[2,3,4]\n", + " },\n", + " n_jobs = -1,\n", + " input_cols = feature_cols,\n", + " label_cols = target_col,\n", + " output_cols = 'PREDICTION',\n", + " scoring=\"f1\",\n", + " )\n", + "\n", + "grid_search.fit(train_sdf)" + ] + }, + { + "cell_type": "markdown", + "id": "55c233ba", + "metadata": {}, + "source": [ + "## Analyze GridSearch Results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "46794ab5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analyze grid search results\n", + "gs_results = grid_search.to_sklearn().cv_results_\n", + "n_estimators_val = []\n", + "max_depth_val = []\n", + "for param_dict in gs_results[\"params\"]:\n", + " n_estimators_val.append(param_dict[\"n_estimators\"])\n", + " max_depth_val.append(param_dict[\"max_depth\"])\n", + "\n", + "f1_val = gs_results[\"mean_test_score\"]\n", + "gs_results_df = pd.DataFrame(data={\"n_estimators\":n_estimators_val,\n", + " \"max_depth\":max_depth_val, \n", + " 'f1_val':f1_val})\n", + "sns.relplot(data=gs_results_df, x=\"max_depth\", y=\"f1_val\", hue=\"n_estimators\", kind='line')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db5c6afd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb b/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb index 14b3c9d..05ac526 100644 --- a/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb +++ b/hol/1_1_DEMO_basic_data_exploration_transformation.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "3b648ee8", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2e5d9443", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1ed15903", "metadata": {}, @@ -29,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": 
"7410ce0d", "metadata": {}, "outputs": [], @@ -38,7 +35,8 @@ "import snowflake.snowpark.functions as F\n", "import snowflake.snowpark.types as T\n", "from snowflake.snowpark.window import Window\n", - "from snowflake.ml.preprocessing import *\n", + "from snowflake.ml.modeling.preprocessing import *\n", + "from snowflake.ml.modeling.impute import *\n", "\n", "import sys\n", "import json\n", @@ -48,7 +46,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a1f95920", "metadata": {}, @@ -58,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9a7da15f", "metadata": {}, "outputs": [], @@ -69,10 +66,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "49090e02", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], "source": [ "session = Session.builder.configs(connection_parameters).create()\n", "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", @@ -80,7 +86,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "ba1c0c26", "metadata": {}, @@ -90,10 +95,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "8e6d1164", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# Creating a Pandas DataFrame\n", "pandas_df = pd.read_csv('data/application_record.csv.zip')\n", @@ -102,10 +115,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "623695f6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "# Creating a Snowpark DataFrame\n", "snowpark_df = session.table('APPLICATION_RECORD')\n", @@ -114,10 +135,21 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "b2d4f6b3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size in MB of Pandas DataFrame in Memory:\n", + " 251.15\n", + "Size in MB of Snowpark DataFrame in Memory:\n", + " 0.0\n" + ] + } + ], "source": [ "# Compare size\n", "print('Size in MB of Pandas DataFrame in Memory:\\n', np.round(sys.getsizeof(pandas_df) / (1024.0**2), 2))\n", @@ -125,7 +157,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "03ea0fc5", "metadata": {}, @@ -135,16 +166,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "43db7633", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'queries': ['SELECT * FROM (APPLICATION_RECORD)'], 'post_actions': []}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "snowpark_df.queries" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d3cf44c3", "metadata": {}, @@ -154,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "e1ec1ddc", "metadata": {}, "outputs": [], @@ -163,7 +204,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a7e707de", "metadata": {}, @@ -173,16 +213,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "db95baaf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((438557, 18), (438557, 18))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pandas_df.shape, pandas_df2.shape" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "327f42bc", "metadata": {}, @@ -192,17 +242,209 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "aa02b8e8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDCODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALNAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEDAYS_BIRTHDAYS_EMPLOYEDFLAG_MOBILFLAG_WORK_PHONEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERS
05008804MYY0427500.0WorkingHigher educationCivil marriageRented apartment-12005-45421100None2.0
15008805MYY0427500.0WorkingHigher educationCivil marriageRented apartment-12005-45421100None2.0
25008806MYY0112500.0WorkingSecondary / secondary specialMarriedHouse / apartment-21474-11341000Security staff2.0
35008808FNY0270000.0Commercial associateSecondary / secondary specialSingle / not marriedHouse / apartment-19110-30511011Sales staff1.0
45008809FNY0270000.0Commercial associateSecondary / secondary specialSingle / not marriedHouse / apartment-19110-30511011Sales staff1.0
\n", + "
" + ], + "text/plain": [ + " ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN \\\n", + "0 5008804 M Y Y 0 \n", + "1 5008805 M Y Y 0 \n", + "2 5008806 M Y Y 0 \n", + "3 5008808 F N Y 0 \n", + "4 5008809 F N Y 0 \n", + "\n", + " AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE \\\n", + "0 427500.0 Working Higher education \n", + "1 427500.0 Working Higher education \n", + "2 112500.0 Working Secondary / secondary special \n", + "3 270000.0 Commercial associate Secondary / secondary special \n", + "4 270000.0 Commercial associate Secondary / secondary special \n", + "\n", + " NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED \\\n", + "0 Civil marriage Rented apartment -12005 -4542 \n", + "1 Civil marriage Rented apartment -12005 -4542 \n", + "2 Married House / apartment -21474 -1134 \n", + "3 Single / not married House / apartment -19110 -3051 \n", + "4 Single / not married House / apartment -19110 -3051 \n", + "\n", + " FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE \\\n", + "0 1 1 0 0 None \n", + "1 1 1 0 0 None \n", + "2 1 0 0 0 Security staff \n", + "3 1 0 1 1 Sales staff \n", + "4 1 0 1 1 Sales staff \n", + "\n", + " CNT_FAM_MEMBERS \n", + "0 2.0 \n", + "1 2.0 \n", + "2 2.0 \n", + "3 1.0 \n", + "4 1.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# snowpark_df.show() <- also possible\n", "snowpark_df.limit(5).to_pandas() # <- collects first 5 rows and displays as pandas-dataframe" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "de7848e9", "metadata": {}, @@ -213,10 +455,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "79a577f5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"DAYS_BIRTH\" |\n", + 
"-------------------------------------------------------\n", + "|M |Working |-12005 |\n", + "|M |Working |-12005 |\n", + "|M |Working |-21474 |\n", + "|F |Commercial associate |-19110 |\n", + "|F |Commercial associate |-19110 |\n", + "|F |Commercial associate |-19110 |\n", + "|F |Commercial associate |-19110 |\n", + "|F |Pensioner |-22464 |\n", + "|F |Pensioner |-22464 |\n", + "|F |Pensioner |-22464 |\n", + "-------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# snowpark_df = snowpark_df.select('CODE_GENDER','NAME_INCOME_TYPE','DAYS_BIRTH',)\n", "snowpark_df = snowpark_df[['CODE_GENDER','NAME_INCOME_TYPE','DAYS_BIRTH']] # -> pandas-like selection\n", @@ -224,7 +488,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "16c88906", "metadata": {}, @@ -235,10 +498,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "e207ac8f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"DAYS_BIRTH\" |\"AGE\" |\n", + "---------------------------------------------------------------\n", + "|M |Working |-12005 |32 |\n", + "|M |Working |-12005 |32 |\n", + "|M |Working |-21474 |58 |\n", + "|F |Commercial associate |-19110 |52 |\n", + "|F |Commercial associate |-19110 |52 |\n", + "|F |Commercial associate |-19110 |52 |\n", + "|F |Commercial associate |-19110 |52 |\n", + "|F |Pensioner |-22464 |61 |\n", + "|F |Pensioner |-22464 |61 |\n", + "|F |Pensioner |-22464 |61 |\n", + "---------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Create a new column\n", "# Formula: Absolute Value of DAYS_BIRTH divided by 365 days rounded down\n", @@ -247,7 +532,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "ed4eabd5", "metadata": {}, @@ -258,10 +542,32 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 13, "id": "8b6057de", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"AGE\" |\n", + "------------------------------------------------\n", + "|M |Working |32 |\n", + "|M |Working |32 |\n", + "|M |Working |58 |\n", + "|F |Commercial associate |52 |\n", + "|F |Commercial associate |52 |\n", + "|F |Commercial associate |52 |\n", + "|F |Commercial associate |52 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Drop a column\n", "snowpark_df = snowpark_df.drop('DAYS_BIRTH')\n", @@ -269,7 +575,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "ea617ddf", "metadata": {}, @@ -279,10 +584,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7abe304d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"AGE\" |\n", + "----------------------------------------------\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |55 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "|F |Pensioner |61 |\n", + "----------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Filter data\n", "snowpark_df = snowpark_df.filter(F.col('NAME_INCOME_TYPE').in_(['Pensioner','Student']))\n", @@ -290,7 +617,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "cbc21227", "metadata": {}, @@ -300,17 +626,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
15, "id": "1a56c822", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"AVG_AGE\" |\n", + "--------------------------------------------------\n", + "|F |Pensioner |59.188624 |\n", + "|M |Pensioner |57.685482 |\n", + "|F |Student |46.090909 |\n", + "|M |Student |27.166667 |\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "snowpark_df = snowpark_df.group_by(['CODE_GENDER','NAME_INCOME_TYPE']).agg([F.avg('AGE').as_('AVG_AGE')])\n", "snowpark_df.show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11a1235e", "metadata": {}, @@ -320,10 +661,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "2196027d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------\n", + "|\"CODE_GENDER\" |\"NAME_INCOME_TYPE\" |\"AVG_AGE\" |\n", + "--------------------------------------------------\n", + "|F |Pensioner |59.188624 |\n", + "|M |Pensioner |57.685482 |\n", + "|F |Student |46.090909 |\n", + "|M |Student |27.166667 |\n", + "--------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Sort data\n", "snowpark_df = snowpark_df.sort(F.col('AVG_AGE').desc())\n", @@ -331,7 +688,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8c5f63dc", "metadata": {}, @@ -343,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "9f2173fb", "metadata": {}, "outputs": [], @@ -353,7 +709,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b7e32fd8", "metadata": {}, @@ -363,17 +718,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "4d4433a5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { 
+ "text/plain": [ + "438557" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Number of rows in dataset\n", "snowpark_df.count()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5150f9f8", "metadata": {}, @@ -383,10 +748,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "bba56dd6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "438510" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Lets drop duplicates based on ID\n", "snowpark_df = snowpark_df.drop_duplicates('ID')\n", @@ -394,7 +770,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "ae98139b", "metadata": {}, @@ -404,16 +779,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "2f3846b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT \"ID\", \"CODE_GENDER\", \"FLAG_OWN_CAR\", \"FLAG_OWN_REALTY\", \"CNT_CHILDREN\", \"AMT_INCOME_TOTAL\", \"NAME_INCOME_TYPE\", \"NAME_EDUCATION_TYPE\", \"NAME_FAMILY_STATUS\", \"NAME_HOUSING_TYPE\", \"DAYS_BIRTH\", \"DAYS_EMPLOYED\", \"FLAG_MOBIL\", \"FLAG_WORK_PHONE\", \"FLAG_PHONE\", \"FLAG_EMAIL\", \"OCCUPATION_TYPE\", \"CNT_FAM_MEMBERS\" FROM ( SELECT \"ID\", \"CODE_GENDER\", \"FLAG_OWN_CAR\", \"FLAG_OWN_REALTY\", \"CNT_CHILDREN\", \"AMT_INCOME_TOTAL\", \"NAME_INCOME_TYPE\", \"NAME_EDUCATION_TYPE\", \"NAME_FAMILY_STATUS\", \"NAME_HOUSING_TYPE\", \"DAYS_BIRTH\", \"DAYS_EMPLOYED\", \"FLAG_MOBIL\", \"FLAG_WORK_PHONE\", \"FLAG_PHONE\", \"FLAG_EMAIL\", \"OCCUPATION_TYPE\", \"CNT_FAM_MEMBERS\", row_number() OVER (PARTITION BY \"ID\" ORDER BY \"ID\" ASC NULLS FIRST ) AS \"LCUKWNN3E5\" FROM APPLICATION_RECORD) WHERE (\"LCUKWNN3E5\" = 1 :: INT)\n" + ] + } + ], "source": [ "print(snowpark_df.queries['queries'][0])" ] }, { - "attachments": {}, "cell_type": 
"markdown", "id": "00849069", "metadata": {}, @@ -423,17 +805,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "3ff508f7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"SUMMARY\" |\"ID\" |\"CODE_GENDER\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"CNT_CHILDREN\" |\"AMT_INCOME_TOTAL\" |\"NAME_INCOME_TYPE\" |\"NAME_EDUCATION_TYPE\" |\"NAME_FAMILY_STATUS\" |\"NAME_HOUSING_TYPE\" |\"DAYS_BIRTH\" |\"DAYS_EMPLOYED\" |\"FLAG_MOBIL\" |\"FLAG_WORK_PHONE\" |\"FLAG_PHONE\" |\"FLAG_EMAIL\" |\"OCCUPATION_TYPE\" |\"CNT_FAM_MEMBERS\" |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|mean |6022034.963688 |NULL |NULL |NULL |0.427381 |187525.41572477252 |NULL |NULL |NULL |NULL |-15998.022996 |60566.188769 |1.0 |0.206128 |0.28777 |0.1082 |NULL |2.1944630681170327 |\n", + "|stddev |571496.2397764492 |NULL |NULL |NULL |0.7248737821165834 |110089.27958266277 |NULL |NULL |NULL |NULL |4185.016222145262 |138770.07283492736 |0.0 |0.4045231760974889 |0.45272397771710743 |0.3106332242372023 |NULL |0.8971920109742918 |\n", + "|min |5008804.0 |F |N |N |0.0 |26100.0 |Commercial associate |Academic degree |Civil marriage |Co-op apartment 
|-25201.0 |-17531.0 |1.0 |0.0 |0.0 |0.0 |Accountants |1.0 |\n", + "|count |438510.0 |438510 |438510 |438510 |438510.0 |438510.0 |438510 |438510 |438510 |438510 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |304317 |438510.0 |\n", + "|max |7999952.0 |M |Y |Y |19.0 |6750000.0 |Working |Secondary / secondary special |Widow |With parents |-7489.0 |365243.0 |1.0 |1.0 |1.0 |1.0 |Waiters/barmen staff |20.0 |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Calculating various statistics per column\n", "snowpark_df.describe().show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d8b0fef7", "metadata": {}, @@ -443,10 +841,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "9fefe311", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------------------------------------\n", + "|\"NAME_INCOME_TYPE\" |\"CODE_GENDER\" |\"AVG_INCOME\" |\n", + "-------------------------------------------------------------\n", + "|Commercial associate |M |249208.08642289176 |\n", + "|Commercial associate |F |206579.17463258584 |\n", + "|Pensioner |M |169049.77416737832 |\n", + "|Pensioner |F |150729.61255448588 |\n", + "|State servant |M |237034.15414285715 |\n", + "|State servant |F |186152.9842904419 |\n", + "|Student |F |165272.72727272726 |\n", + "|Student |M |149250.0 |\n", + "|Working |M |202170.82427397132 |\n", + "|Working |F |168679.56899413437 |\n", + "-------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# 
Average Income per Income Type and Gender\n", "analysis_df = snowpark_df.group_by(['NAME_INCOME_TYPE','CODE_GENDER']).agg([F.mean('AMT_INCOME_TOTAL').as_('AVG_INCOME')])\n", @@ -455,7 +875,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "32f3d751", "metadata": {}, @@ -465,10 +884,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "61273d48", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"SUMMARY\" |\"OCCUPATION_TYPE\" |\"ID\" |\"CODE_GENDER\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"CNT_CHILDREN\" |\"AMT_INCOME_TOTAL\" |\"NAME_INCOME_TYPE\" |\"NAME_EDUCATION_TYPE\" |\"NAME_FAMILY_STATUS\" |\"NAME_HOUSING_TYPE\" |\"DAYS_BIRTH\" |\"DAYS_EMPLOYED\" |\"FLAG_MOBIL\" |\"FLAG_WORK_PHONE\" |\"FLAG_PHONE\" |\"FLAG_EMAIL\" |\"CNT_FAM_MEMBERS\" |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|mean |NULL |6022034.963688 |NULL |NULL |NULL |0.427381 |187525.41572477252 |NULL |NULL |NULL |NULL |-15998.022996 |60566.188769 |1.0 |0.206128 |0.28777 |0.1082 |2.1944630681170327 |\n", + "|min |Accountants |5008804.0 |F |N |N |0.0 |26100.0 |Commercial associate |Academic degree |Civil marriage |Co-op apartment 
|-25201.0 |-17531.0 |1.0 |0.0 |0.0 |0.0 |1.0 |\n", + "|stddev |NULL |571496.2397764492 |NULL |NULL |NULL |0.7248737821165834 |110089.27958266277 |NULL |NULL |NULL |NULL |4185.016222145262 |138770.07283492736 |0.0 |0.4045231760974889 |0.45272397771710743 |0.3106332242372023 |0.8971920109742918 |\n", + "|count |438510 |438510.0 |438510 |438510 |438510 |438510.0 |438510.0 |438510 |438510 |438510 |438510 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |\n", + "|max |Waiters/barmen staff |7999952.0 |M |Y |Y |19.0 |6750000.0 |Working |Secondary / secondary special |Widow |With parents |-7489.0 |365243.0 |1.0 |1.0 |1.0 |1.0 |20.0 |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "my_imputer = SimpleImputer(input_cols=['OCCUPATION_TYPE'], output_cols=['OCCUPATION_TYPE'] ,strategy='most_frequent')\n", "my_imputer.fit(snowpark_df)\n", @@ -477,7 +913,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "991beaa4", "metadata": {}, @@ -487,16 +922,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "bac12f72", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT iff(\"OCCUPATION_TYPE\" IS NULL, 'Laborers', \"OCCUPATION_TYPE\") AS \"OCCUPATION_TYPE\", \"ID\", \"CODE_GENDER\", \"FLAG_OWN_CAR\", \"FLAG_OWN_REALTY\", \"CNT_CHILDREN\", \"AMT_INCOME_TOTAL\", \"NAME_INCOME_TYPE\", \"NAME_EDUCATION_TYPE\", \"NAME_FAMILY_STATUS\", \"NAME_HOUSING_TYPE\", \"DAYS_BIRTH\", \"DAYS_EMPLOYED\", \"FLAG_MOBIL\", \"FLAG_WORK_PHONE\", \"FLAG_PHONE\", \"FLAG_EMAIL\", 
\"CNT_FAM_MEMBERS\" FROM ( SELECT \"ID\", \"CODE_GENDER\", \"FLAG_OWN_CAR\", \"FLAG_OWN_REALTY\", \"CNT_CHILDREN\", \"AMT_INCOME_TOTAL\", \"NAME_INCOME_TYPE\", \"NAME_EDUCATION_TYPE\", \"NAME_FAMILY_STATUS\", \"NAME_HOUSING_TYPE\", \"DAYS_BIRTH\", \"DAYS_EMPLOYED\", \"FLAG_MOBIL\", \"FLAG_WORK_PHONE\", \"FLAG_PHONE\", \"FLAG_EMAIL\", \"CNT_FAM_MEMBERS\", \"OCCUPATION_TYPE\" AS \"OCCUPATION_TYPE\" FROM ( SELECT \"ID\", \"CODE_GENDER\", \"FLAG_OWN_CAR\", \"FLAG_OWN_REALTY\", \"CNT_CHILDREN\", \"AMT_INCOME_TOTAL\", \"NAME_INCOME_TYPE\", \"NAME_EDUCATION_TYPE\", \"NAME_FAMILY_STATUS\", \"NAME_HOUSING_TYPE\", \"DAYS_BIRTH\", \"DAYS_EMPLOYED\", \"FLAG_MOBIL\", \"FLAG_WORK_PHONE\", \"FLAG_PHONE\", \"FLAG_EMAIL\", \"OCCUPATION_TYPE\", \"CNT_FAM_MEMBERS\", row_number() OVER (PARTITION BY \"ID\" ORDER BY \"ID\" ASC NULLS FIRST ) AS \"LCUKWNN3E5\" FROM APPLICATION_RECORD) WHERE (\"LCUKWNN3E5\" = 1 :: INT))\n" + ] + } + ], "source": [ "print(snowpark_df.queries['queries'][0])" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5b8d26f8", "metadata": {}, @@ -506,16 +948,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "13cca9d1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "StructType([StructField('OCCUPATION_TYPE', StringType(16777216), nullable=True), StructField('ID', LongType(), nullable=True), StructField('CODE_GENDER', StringType(16777216), nullable=True), StructField('FLAG_OWN_CAR', StringType(16777216), nullable=True), StructField('FLAG_OWN_REALTY', StringType(16777216), nullable=True), StructField('CNT_CHILDREN', LongType(), nullable=True), StructField('AMT_INCOME_TOTAL', DoubleType(), nullable=True), StructField('NAME_INCOME_TYPE', StringType(16777216), nullable=True), StructField('NAME_EDUCATION_TYPE', StringType(16777216), nullable=True), StructField('NAME_FAMILY_STATUS', StringType(16777216), nullable=True), StructField('NAME_HOUSING_TYPE', StringType(16777216), 
nullable=True), StructField('DAYS_BIRTH', LongType(), nullable=True), StructField('DAYS_EMPLOYED', LongType(), nullable=True), StructField('FLAG_MOBIL', LongType(), nullable=True), StructField('FLAG_WORK_PHONE', LongType(), nullable=True), StructField('FLAG_PHONE', LongType(), nullable=True), StructField('FLAG_EMAIL', LongType(), nullable=True), StructField('CNT_FAM_MEMBERS', DoubleType(), nullable=True)])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "snowpark_df.schema" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5b2ae444", "metadata": {}, @@ -525,10 +977,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "17735cdb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['OCCUPATION_TYPE',\n", + " 'CODE_GENDER',\n", + " 'FLAG_OWN_CAR',\n", + " 'FLAG_OWN_REALTY',\n", + " 'NAME_INCOME_TYPE',\n", + " 'NAME_EDUCATION_TYPE',\n", + " 'NAME_FAMILY_STATUS',\n", + " 'NAME_HOUSING_TYPE']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all categorical columns\n", "categorical_types = [T.StringType]\n", @@ -537,7 +1007,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e1c1ba4c", "metadata": {}, @@ -547,10 +1016,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "682ae682", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
COLUMN_NAMENUM_UNIQUE_VALUES
0OCCUPATION_TYPE18
1CODE_GENDER2
2FLAG_OWN_CAR2
3FLAG_OWN_REALTY2
4NAME_INCOME_TYPE5
5NAME_EDUCATION_TYPE5
6NAME_FAMILY_STATUS5
7NAME_HOUSING_TYPE6
\n", + "
" + ], + "text/plain": [ + " COLUMN_NAME NUM_UNIQUE_VALUES\n", + "0 OCCUPATION_TYPE 18\n", + "1 CODE_GENDER 2\n", + "2 FLAG_OWN_CAR 2\n", + "3 FLAG_OWN_REALTY 2\n", + "4 NAME_INCOME_TYPE 5\n", + "5 NAME_EDUCATION_TYPE 5\n", + "6 NAME_FAMILY_STATUS 5\n", + "7 NAME_HOUSING_TYPE 6" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Number of unique values per categorical column\n", "unique_values = []\n", @@ -560,7 +1116,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "bc3c3a6a", "metadata": {}, @@ -573,10 +1128,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "995c4a8d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"OCCUPATION_TYPE\" |\"ID\" |\"CODE_GENDER\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"CNT_CHILDREN\" |\"AMT_INCOME_TOTAL\" |\"NAME_INCOME_TYPE\" |\"NAME_EDUCATION_TYPE\" |\"NAME_FAMILY_STATUS\" |\"NAME_HOUSING_TYPE\" |\"DAYS_BIRTH\" |\"DAYS_EMPLOYED\" |\"FLAG_MOBIL\" |\"FLAG_WORK_PHONE\" |\"FLAG_PHONE\" |\"FLAG_EMAIL\" |\"CNT_FAM_MEMBERS\" |\n", + "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|Core staff |6754942 |F |N |Y |0 |135000.0 |Working |Higher education |Civil marriage |House / apartment |-11745 |-101 |1 |0 |1 |0 |2.0 |\n", + 
"|Sales staff |6736592 |F |N |Y |0 |112500.0 |Commercial associate |Secondary / secondary special |Single / not married |House / apartment |-8398 |-578 |1 |1 |1 |1 |1.0 |\n", + "|Laborers |5474464 |F |N |N |0 |135000.0 |Pensioner |Secondary / secondary special |Widow |House / apartment |-22064 |365243 |1 |0 |0 |0 |1.0 |\n", + "|Sales staff |6460419 |F |N |Y |1 |67500.0 |Commercial associate |Secondary / secondary special |Separated |House / apartment |-13510 |-3852 |1 |0 |0 |0 |2.0 |\n", + "|Sales staff |6448814 |M |N |Y |0 |166500.0 |Commercial associate |Secondary / secondary special |Civil marriage |House / apartment |-8137 |-610 |1 |1 |1 |0 |2.0 |\n", + "|Managers |7097047 |M |Y |Y |1 |450000.0 |Working |Secondary / secondary special |Married |House / apartment |-15862 |-1463 |1 |0 |0 |0 |3.0 |\n", + "|Laborers |5090669 |F |Y |Y |0 |225000.0 |Working |Secondary / secondary special |Married |House / apartment |-17821 |-1830 |1 |0 |0 |0 |2.0 |\n", + "|Sales staff |5901250 |F |N |N |0 |292500.0 |Working |Higher education |Married |House / apartment |-14458 |-3882 |1 |0 |0 |1 |2.0 |\n", + "|Security staff |6652097 |M |N |Y |0 |144000.0 |Working |Secondary / secondary special |Married |House / apartment |-12652 |-845 |1 |1 |0 |0 |2.0 |\n", + "|Laborers |6231048 |F |N |Y |1 |130500.0 |Working |Higher education |Married |House / apartment |-11646 |-218 |1 |0 |0 |0 |3.0 |\n", + "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "snowpark_df.write.save_as_table(table_name='MY_FIRST_ANALYSIS', mode='overwrite')\n", "session.table('MY_FIRST_ANALYSIS').show()" @@ -584,13 +1161,21 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 29, "id": "a4dc8b69", "metadata": {}, "outputs": [], "source": [ "session.close()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "816763ea-f410-4f25-a7ca-b60f3d32d897", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -609,7 +1194,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/hol/1_3_DEMO_full_data_exploration_transformation.ipynb b/hol/1_3_DEMO_full_data_exploration_transformation.ipynb index 395ce78..4fb8ea0 100644 --- a/hol/1_3_DEMO_full_data_exploration_transformation.ipynb +++ b/hol/1_3_DEMO_full_data_exploration_transformation.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "3b648ee8", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2e5d9443", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1ed15903", "metadata": {}, @@ -29,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "7410ce0d", "metadata": {}, "outputs": [], @@ -39,7 +36,7 @@ "import snowflake.snowpark.functions as F\n", "import snowflake.snowpark.types as T\n", "from snowflake.snowpark.window import Window\n", - "from snowflake.ml.preprocessing import *\n", + "from snowflake.ml.modeling.preprocessing import *\n", "\n", "import json\n", "import pandas as pd\n", @@ -47,7 +44,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "183ef59f", "metadata": {}, @@ -57,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e0abcd13", "metadata": {}, "outputs": [], @@ -68,16 +64,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f4274fc5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'snowflake-snowpark-python': '1.5.1',\n", + " 
'pandas': '1.5.3',\n", + " 'scikit-learn': '1.2.2',\n", + " 'lightgbm': '3.3.5',\n", + " 'xgboost': '1.7.3',\n", + " 'joblib': '1.2.0',\n", + " 'imbalanced-learn': '0.10.1'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packages_version" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a1f95920", "metadata": {}, @@ -87,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "d2c8fbaa", "metadata": {}, "outputs": [], @@ -98,10 +110,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "49090e02", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], "source": [ "session = Session.builder.configs(connection_parameters).create()\n", "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", @@ -109,7 +130,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fd745008", "metadata": {}, @@ -120,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2242ced2", "metadata": {}, "outputs": [], @@ -131,7 +151,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f2be07e3", "metadata": {}, @@ -141,17 +160,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "57c73d02", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "438557" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# How many records do we have?\n", "application_record_sdf.count()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "399d3874", "metadata": {}, @@ -162,17 +191,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "8f236685", 
"metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number Duplicates: 47\n" + ] + } + ], "source": [ "duplicates_sdf = application_record_sdf.group_by('ID').agg(F.count(('ID'))).filter(F.col('COUNT(ID)') > 1)\n", "print('Number Duplicates:', duplicates_sdf.count())" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a4a0c268", "metadata": {}, @@ -182,17 +218,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "98a1205b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "438510" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "application_record_sdf = application_record_sdf.drop_duplicates('ID')\n", "application_record_sdf.count()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "7ec5d998", "metadata": {}, @@ -208,38 +254,254 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "567aff66", "metadata": {}, - "outputs": [], - "source": [ - "application_record_sdf.describe().show()" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUMMARYIDCODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALNAME_INCOME_TYPENAME_EDUCATION_TYPENAME_FAMILY_STATUSNAME_HOUSING_TYPEDAYS_BIRTHDAYS_EMPLOYEDFLAG_MOBILFLAG_WORK_PHONEFLAG_PHONEFLAG_EMAILOCCUPATION_TYPECNT_FAM_MEMBERS
0mean6.022035e+06NoneNoneNone0.4273811.875254e+05NoneNoneNoneNone-15998.02299660566.1887691.00.2061280.2877700.108200None2.194463
1count4.385100e+05438510438510438510438510.0000004.385100e+05438510438510438510438510438510.000000438510.000000438510.0438510.000000438510.000000438510.000000304317438510.000000
2stddev5.714962e+05NoneNoneNone0.7248741.100893e+05NoneNoneNoneNone4185.016222138770.0728350.00.4045230.4527240.310633None0.897192
3min5.008804e+06FNN0.0000002.610000e+04Commercial associateAcademic degreeCivil marriageCo-op apartment-25201.000000-17531.0000001.00.0000000.0000000.000000Accountants1.000000
4max7.999952e+06MYY19.0000006.750000e+06WorkingSecondary / secondary specialWidowWith parents-7489.000000365243.0000001.01.0000001.0000001.000000Waiters/barmen staff20.000000
\n", + "
" + ], + "text/plain": [ + " SUMMARY ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY \\\n", + "0 mean 6.022035e+06 None None None \n", + "1 count 4.385100e+05 438510 438510 438510 \n", + "2 stddev 5.714962e+05 None None None \n", + "3 min 5.008804e+06 F N N \n", + "4 max 7.999952e+06 M Y Y \n", + "\n", + " CNT_CHILDREN AMT_INCOME_TOTAL NAME_INCOME_TYPE \\\n", + "0 0.427381 1.875254e+05 None \n", + "1 438510.000000 4.385100e+05 438510 \n", + "2 0.724874 1.100893e+05 None \n", + "3 0.000000 2.610000e+04 Commercial associate \n", + "4 19.000000 6.750000e+06 Working \n", + "\n", + " NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE \\\n", + "0 None None None \n", + "1 438510 438510 438510 \n", + "2 None None None \n", + "3 Academic degree Civil marriage Co-op apartment \n", + "4 Secondary / secondary special Widow With parents \n", + "\n", + " DAYS_BIRTH DAYS_EMPLOYED FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE \\\n", + "0 -15998.022996 60566.188769 1.0 0.206128 0.287770 \n", + "1 438510.000000 438510.000000 438510.0 438510.000000 438510.000000 \n", + "2 4185.016222 138770.072835 0.0 0.404523 0.452724 \n", + "3 -25201.000000 -17531.000000 1.0 0.000000 0.000000 \n", + "4 -7489.000000 365243.000000 1.0 1.000000 1.000000 \n", + "\n", + " FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS \n", + "0 0.108200 None 2.194463 \n", + "1 438510.000000 304317 438510.000000 \n", + "2 0.310633 None 0.897192 \n", + "3 0.000000 Accountants 1.000000 \n", + "4 1.000000 Waiters/barmen staff 20.000000 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# application_record_sdf.describe().show()\n", + "application_record_sdf.describe().to_pandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8fcd6862", "metadata": {}, "source": [ "### Missing Value Imputation\n", "The describe output show that we have missing values in OCCUPATION_TYPE, it's count are less (304317) than the total count (438557) \n", - "We will use the 
**fillna** ,ethod to replace missing values in OCCUPATION_TYPE with 'OTHER', since they are so many we creates a new category for it." + "We will use the **fillna** method to replace missing values in OCCUPATION_TYPE with 'OTHER', since they are so many we creates a new category for it." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "b292ddf9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"SUMMARY\" |\"ID\" |\"CODE_GENDER\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"CNT_CHILDREN\" |\"AMT_INCOME_TOTAL\" |\"NAME_INCOME_TYPE\" |\"NAME_EDUCATION_TYPE\" |\"NAME_FAMILY_STATUS\" |\"NAME_HOUSING_TYPE\" |\"DAYS_BIRTH\" |\"DAYS_EMPLOYED\" |\"FLAG_MOBIL\" |\"FLAG_WORK_PHONE\" |\"FLAG_PHONE\" |\"FLAG_EMAIL\" |\"OCCUPATION_TYPE\" |\"CNT_FAM_MEMBERS\" |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|count |438510.0 |438510 |438510 |438510 |438510.0 |438510.0 |438510 |438510 |438510 |438510 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |438510.0 |438510 |438510.0 |\n", + "|min |5008804.0 |F |N |N |0.0 |26100.0 |Commercial associate |Academic degree |Civil marriage |Co-op apartment |-25201.0 |-17531.0 |1.0 |0.0 |0.0 |0.0 
|Accountants |1.0 |\n", + "|stddev |571496.2397764492 |NULL |NULL |NULL |0.7248737821165834 |110089.27958266283 |NULL |NULL |NULL |NULL |4185.016222145262 |138770.07283492736 |0.0 |0.4045231760974889 |0.45272397771710743 |0.3106332242372023 |NULL |0.8971920109742918 |\n", + "|mean |6022034.963688 |NULL |NULL |NULL |0.427381 |187525.41572477252 |NULL |NULL |NULL |NULL |-15998.022996 |60566.188769 |1.0 |0.206128 |0.28777 |0.1082 |NULL |2.1944630681170327 |\n", + "|max |7999952.0 |M |Y |Y |19.0 |6750000.0 |Working |Secondary / secondary special |Widow |With parents |-7489.0 |365243.0 |1.0 |1.0 |1.0 |1.0 |Waiters/barmen staff |20.0 |\n", + "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "application_record_sdf = application_record_sdf.fillna(value='OTHER', subset=['OCCUPATION_TYPE'])\n", - "application_record_sdf.describe().show()" + "application_record_sdf.describe().show()\n", + "# application_record_sdf.describe().to_pandas() # for better display format" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8ab67b19", "metadata": {}, @@ -252,10 +514,157 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "724dd8a1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
COLUMN_NAMENUM_UNIQUE_VALUES
0ID438510
1CODE_GENDER2
2FLAG_OWN_CAR2
3FLAG_OWN_REALTY2
4CNT_CHILDREN12
5AMT_INCOME_TOTAL866
6NAME_INCOME_TYPE5
7NAME_EDUCATION_TYPE5
8NAME_FAMILY_STATUS5
9NAME_HOUSING_TYPE6
10DAYS_BIRTH16379
11DAYS_EMPLOYED9406
12FLAG_MOBIL1
13FLAG_WORK_PHONE2
14FLAG_PHONE2
15FLAG_EMAIL2
16OCCUPATION_TYPE19
17CNT_FAM_MEMBERS13
\n", + "
" + ], + "text/plain": [ + " COLUMN_NAME NUM_UNIQUE_VALUES\n", + "0 ID 438510\n", + "1 CODE_GENDER 2\n", + "2 FLAG_OWN_CAR 2\n", + "3 FLAG_OWN_REALTY 2\n", + "4 CNT_CHILDREN 12\n", + "5 AMT_INCOME_TOTAL 866\n", + "6 NAME_INCOME_TYPE 5\n", + "7 NAME_EDUCATION_TYPE 5\n", + "8 NAME_FAMILY_STATUS 5\n", + "9 NAME_HOUSING_TYPE 6\n", + "10 DAYS_BIRTH 16379\n", + "11 DAYS_EMPLOYED 9406\n", + "12 FLAG_MOBIL 1\n", + "13 FLAG_WORK_PHONE 2\n", + "14 FLAG_PHONE 2\n", + "15 FLAG_EMAIL 2\n", + "16 OCCUPATION_TYPE 19\n", + "17 CNT_FAM_MEMBERS 13" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "unique_values = []\n", "for column in application_record_sdf.columns:\n", @@ -264,7 +673,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "4b361107", "metadata": {}, @@ -274,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "b004a868", "metadata": {}, "outputs": [], @@ -283,7 +691,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c232616e", "metadata": {}, @@ -310,16 +717,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "5e765eec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"STATUS\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |383120 |\n", + "|1 |11090 |\n", + "|2 |868 |\n", + "|3 |320 |\n", + "|4 |223 |\n", + "|5 |1693 |\n", + "|C |442031 |\n", + "|X |209230 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "credit_record_sdf.group_by('STATUS').count().sort('STATUS').show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8c78128a", "metadata": {}, @@ -331,10 +757,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "1fb73f51", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"----------------------\n", + "|\"ID\" |\"TARGET\" |\n", + "----------------------\n", + "|5001711 |0 |\n", + "|5001712 |0 |\n", + "|5001713 |0 |\n", + "|5001714 |0 |\n", + "|5001715 |0 |\n", + "|5001717 |0 |\n", + "|5001718 |0 |\n", + "|5001719 |0 |\n", + "|5001720 |0 |\n", + "|5001723 |0 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "cpunt = credit_record_sdf.group_by('ID')\\\n", " .agg(F.sum(F.iff(F.col('STATUS').in_(['2', '3','4','5']), 1, 0)).as_(\"CNT_LATE\"))\\\n", @@ -344,7 +792,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "68100059", "metadata": {}, @@ -355,16 +802,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "212c2392", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------\n", + "|\"TARGET\" |\"COUNT(TARGET)\" |\n", + "------------------------------\n", + "|0 |45318 |\n", + "|1 |667 |\n", + "------------------------------\n", + "\n" + ] + } + ], "source": [ "cpunt.group_by('TARGET').agg(F.count('TARGET')).show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "668c30ff", "metadata": {}, @@ -374,10 +834,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "05e2e118", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "36457" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "application_record_sdf = application_record_sdf.join(cpunt, using_columns='ID', join_type='inner')\n", "\n", @@ -386,16 +857,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "b84a77a5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|\"ID\" |\"CODE_GENDER\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"CNT_CHILDREN\" |\"AMT_INCOME_TOTAL\" |\"NAME_INCOME_TYPE\" |\"NAME_EDUCATION_TYPE\" |\"NAME_FAMILY_STATUS\" |\"NAME_HOUSING_TYPE\" |\"DAYS_BIRTH\" |\"DAYS_EMPLOYED\" |\"FLAG_WORK_PHONE\" |\"FLAG_PHONE\" |\"FLAG_EMAIL\" |\"OCCUPATION_TYPE\" |\"CNT_FAM_MEMBERS\" |\"TARGET\" |\n", + "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "|5111190 |F |N |Y |2 |247500.0 |Commercial associate |Secondary / secondary special |Married |House / apartment |-13019 |-2307 |0 |0 |0 |OTHER |4.0 |0 |\n", + "|5140183 |F |N |Y |0 |67500.0 |State servant |Secondary / secondary special |Married |Municipal apartment |-15161 |-7190 |0 |0 |0 |Core staff |2.0 |0 |\n", + "|5033998 |F |N |N |0 |112500.0 |Commercial associate |Secondary / secondary special |Separated |Municipal apartment |-16417 |-1615 |0 |1 |0 |Laborers |1.0 |0 |\n", + "|5099974 |F |N |N |0 |180000.0 |Working |Secondary / secondary special |Civil marriage |House / apartment |-10218 |-1740 |0 |1 |0 |Security staff |2.0 |0 |\n", + "|5095056 |F |Y |Y |0 |157500.0 |Pensioner |Secondary / secondary special |Married |House / apartment |-22684 |365243 |0 |0 |0 |OTHER |2.0 |0 |\n", + "|5105697 |F |Y |N |0 |225000.0 |Working |Secondary / secondary special |Married |Municipal apartment |-12155 |-667 |0 |0 |0 |Laborers |2.0 |0 |\n", + 
"|5024230 |F |Y |Y |0 |540000.0 |Commercial associate |Higher education |Married |House / apartment |-15702 |-185 |0 |1 |0 |OTHER |2.0 |0 |\n", + "|5029014 |F |N |Y |0 |112500.0 |Working |Secondary / secondary special |Married |House / apartment |-13868 |-415 |0 |0 |0 |Laborers |2.0 |0 |\n", + "|5149342 |F |Y |Y |0 |292500.0 |Commercial associate |Higher education |Single / not married |House / apartment |-16930 |-8995 |0 |0 |0 |High skill tech staff |1.0 |0 |\n", + "|5061967 |F |N |Y |0 |225000.0 |Commercial associate |Secondary / secondary special |Married |House / apartment |-22897 |-1812 |0 |0 |0 |Managers |2.0 |0 |\n", + "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "application_record_sdf.show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "39a059e3", "metadata": {}, @@ -425,7 +917,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "171e11d9", "metadata": {}, @@ -436,10 +927,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "bfceb0de", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------------------\n", + "|\"CNT_CHILDREN\" |\"COUNT\" |\"HIGH_RISK\" |\n", + "------------------------------------------\n", + "|0 |25201 |431 |\n", + "|1 |7492 |120 |\n", + "|2 |3256 |52 |\n", + "|3 |419 |12 |\n", + "|4 |63 |1 |\n", + "|5 |20 |0 |\n", + "|7 |2 |0 |\n", + "|14 |3 |0 |\n", + "|19 |1 |0 |\n", + "------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# High Risk Ratio for CNT_CHILDREN\n", "var_analysis = application_record_sdf.group_by('CNT_CHILDREN')\n", @@ 
-449,7 +961,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5e9776df", "metadata": {}, @@ -459,10 +970,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "b5e05ace", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------\n", + "|\"ID\" |\"CNT_CHILDREN_IND\" |\n", + "--------------------------------\n", + "|5088857 |2More |\n", + "|5115581 |0 |\n", + "|5068783 |0 |\n", + "|5137375 |1 |\n", + "|5085814 |0 |\n", + "|5150182 |0 |\n", + "|5024092 |0 |\n", + "|5088820 |0 |\n", + "|5056032 |0 |\n", + "|5028780 |0 |\n", + "--------------------------------\n", + "\n" + ] + } + ], "source": [ "application_record_sdf = application_record_sdf.with_column('CNT_CHILDREN_IND', \n", " F.iff(F.col('CNT_CHILDREN') >= 2, \n", @@ -475,7 +1008,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b5aa79fb", "metadata": {}, @@ -486,10 +1018,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "261296ae", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------\n", + "|\"CNT_FAM_MEMBERS\" |\"COUNT\" |\"HIGH_RISK\" |\n", + "---------------------------------------------\n", + "|1.0 |6987 |139 |\n", + "|2.0 |19463 |313 |\n", + "|3.0 |6421 |105 |\n", + "|4.0 |3106 |46 |\n", + "|5.0 |397 |12 |\n", + "|6.0 |58 |1 |\n", + "|7.0 |19 |0 |\n", + "|9.0 |2 |0 |\n", + "|15.0 |3 |0 |\n", + "|20.0 |1 |0 |\n", + "---------------------------------------------\n", + "\n", + "-----------------------------------\n", + "|\"ID\" |\"CNT_FAM_MEMBERS_IND\" |\n", + "-----------------------------------\n", + "|5088857 |3More |\n", + "|5115581 |2 |\n", + "|5068783 |1 |\n", + "|5137375 |3More |\n", + "|5085814 |2 |\n", + "|5150182 |2 |\n", + "|5024092 |1 |\n", + "|5088820 |1 |\n", + "|5056032 |2 |\n", + "|5028780 |2 |\n", 
+ "-----------------------------------\n", + "\n" + ] + } + ], "source": [ "var_analysis = application_record_sdf.group_by('CNT_FAM_MEMBERS')\n", "var_analysis = var_analysis.agg([F.count('CNT_FAM_MEMBERS').as_('COUNT'), \n", @@ -509,7 +1078,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f8379a5e", "metadata": {}, @@ -524,10 +1092,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "7fa5667c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Calculate the age given the number of days\n", "application_record_sdf = application_record_sdf.with_column('AGE', F.abs(F.floor(F.col('DAYS_BIRTH') / 365)))\n", @@ -539,7 +1128,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "3225a9ff", "metadata": {}, @@ -549,10 +1137,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "d5a4ce19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Calculate the workyears given the number of days of employment\n", "application_record_sdf = application_record_sdf.with_column('WORKYEAR', F.abs(F.floor(F.col('DAYS_EMPLOYED') / 365)))\n", @@ -565,7 +1164,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a673e237", "metadata": {}, @@ -576,10 +1174,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "e0ae6250", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----------------------------------\n", + "|\"OCCUPATION_TYPE\" |\"COUNT\" |\n", + "-----------------------------------\n", + "|IT staff |60 |\n", + "|Realty agents |79 |\n", + "|HR staff |85 |\n", + "|Secretaries |151 |\n", + "|Waiters/barmen staff |174 |\n", + "|Low-skill Laborers |175 |\n", + "|Private service staff |344 |\n", + "|Cleaning staff |551 |\n", + "|Security staff |592 |\n", + "|Cooking staff |655 |\n", + "-----------------------------------\n", + "\n", + "---------------------\n", + "|\"OCCUPATION_TYPE\" |\n", + "---------------------\n", + "|OTHER |\n", + "|OTHER |\n", + "|LABOURWORK |\n", + "|OFFICEWORK |\n", + "|OFFICEWORK |\n", + "|OFFICEWORK |\n", + "|OFFICEWORK |\n", + "|LABOURWORK |\n", + "|LABOURWORK |\n", + "|LABOURWORK |\n", + "---------------------\n", + "\n" + ] + } + ], "source": [ "var_analysis = application_record_sdf.group_by('OCCUPATION_TYPE').agg(F.count('OCCUPATION_TYPE').as_('COUNT'))\n", "var_analysis = var_analysis.sort('COUNT').show()\n", @@ -617,7 +1252,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8b7436dd", "metadata": {}, @@ -627,7 +1261,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "33e55255", "metadata": {}, "outputs": [], @@ -637,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "6385ad52", "metadata": {}, 
"outputs": [], @@ -651,10 +1285,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "a5034017", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/oazarmanesh/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/session.py:1383: UserWarning: Pandas Dataframe has non-standard index of type which will not be written. Consider changing the index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s)\n", + " success, nchunks, nrows, ci_output = write_pandas(\n", + "/Users/oazarmanesh/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/session.py:1383: UserWarning: Pandas Dataframe has non-standard index of type which will not be written. Consider changing the index to pd.RangeIndex(start=0,...,step=1) or call reset_index() to keep index as column(s)\n", + " success, nchunks, nrows, ci_output = write_pandas(\n" + ] + } + ], "source": [ "# Prepare values for One-Hot-Encoding\n", "ohe_cols = ['CODE_GENDER','NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'CNT_CHILDREN_IND', 'CNT_FAM_MEMBERS_IND', 'OCCUPATION_TYPE','NAME_HOUSING_TYPE']\n", @@ -672,7 +1317,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "5f8d0cf5", "metadata": {}, "outputs": [], @@ -686,16 +1331,442 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "73ddfb03", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "textn", + "|\"AMT_INCOME_TOTAL\" |\"AGE\" |\"WORKYEAR\" |\"CODE_GENDER_F\" |\"CODE_GENDER_M\" |\"NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE\" |\"NAME_INCOME_TYPE_PENSIONER\" |\"NAME_INCOME_TYPE_STATE_SERVANT\" |\"NAME_INCOME_TYPE_STUDENT\" |\"NAME_INCOME_TYPE_WORKING\" |\"NAME_EDUCATION_TYPE_ACADEMIC_DEGREE\" |\"NAME_EDUCATION_TYPE_HIGHER_EDUCATION\" 
|\"NAME_EDUCATION_TYPE_INCOMPLETE_HIGHER\" |\"NAME_EDUCATION_TYPE_LOWER_SECONDARY\" |\"NAME_EDUCATION_TYPE_SECONDARY_SECONDARY_SPECIAL\" |\"NAME_FAMILY_STATUS_CIVIL_MARRIAGE\" |\"NAME_FAMILY_STATUS_MARRIED\" |\"NAME_FAMILY_STATUS_SEPARATED\" |\"NAME_FAMILY_STATUS_SINGLE_NOT_MARRIED\" |\"NAME_FAMILY_STATUS_WIDOW\" |\"CNT_CHILDREN_IND_0\" |\"CNT_CHILDREN_IND_1\" |\"CNT_CHILDREN_IND_2MORE\" |\"CNT_FAM_MEMBERS_IND_1\" |\"CNT_FAM_MEMBERS_IND_2\" |\"CNT_FAM_MEMBERS_IND_3MORE\" |\"OCCUPATION_TYPE_HIGHTECHWORK\" |\"OCCUPATION_TYPE_LABOURWORK\" |\"OCCUPATION_TYPE_OFFICEWORK\" |\"OCCUPATION_TYPE_OTHER\" |\"NAME_HOUSING_TYPE_CO_OP_APARTMENT\" |\"NAME_HOUSING_TYPE_HOUSE_APARTMENT\" |\"NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT\" |\"NAME_HOUSING_TYPE_OFFICE_APARTMENT\" |\"NAME_HOUSING_TYPE_RENTED_APARTMENT\" |\"NAME_HOUSING_TYPE_WITH_PARENTS\" |\"FLAG_OWN_CAR\" |\"FLAG_OWN_REALTY\" |\"ID\" |\"TARGET\" |\nn", + "|0.723608095562434 |1.2441583640019152 |0.19336128058914914 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |5008810 |0 |\n", + "|-0.7837771442667844 |1.660494591017464 |0.8118155727798374 |0.0 |1.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |1.0 |5008844 |0 |\n", + "|-0.7837771442667844 |1.660494591017464 |0.8118155727798374 |0.0 |1.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |1.0 |5008849 |0 |\n", + "|-0.6545726951385656 |-0.629354657568054 |-0.42509301160153906 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |5008873 |0 |\n", + "|0.723608095562434 |-0.629354657568054 |-0.5797065846492111 |0.0 
|1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |1.0 |5008837 |0 |\n", + "|-0.6545726951385656 |-0.629354657568054 |-0.42509301160153906 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |5008880 |0 |\n", + "|0.5082673470154029 |0.6196540234785921 |-0.5797065846492111 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |5008888 |0 |\n", + "|0.5082673470154029 |0.6196540234785921 |-0.5797065846492111 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |5008889 |0 |\n", + "|0.9820169938188714 |0.20331779646304343 |0.19336128058914914 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |5008909 |0 |\n", + "|0.9820169938188714 |0.20331779646304343 |0.19336128058914914 |1.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |1.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |0.0 |0.0 |0.0 |0.0 |1.0 |0.0 |0.0 |1.0 |5008915 |0 |\nn", + "\n" + ] + } + ], "source": [ "application_record_sdf.show()" ] }, { - "attachments": {}, + "cell_type": "code", + "execution_count": 30, + "id": "9b89e7e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AMT_INCOME_TOTALAGEWORKYEARCODE_GENDER_FCODE_GENDER_MNAME_INCOME_TYPE_COMMERCIAL_ASSOCIATENAME_INCOME_TYPE_PENSIONERNAME_INCOME_TYPE_STATE_SERVANTNAME_INCOME_TYPE_STUDENTNAME_INCOME_TYPE_WORKING...NAME_HOUSING_TYPE_CO_OP_APARTMENTNAME_HOUSING_TYPE_HOUSE_APARTMENTNAME_HOUSING_TYPE_MUNICIPAL_APARTMENTNAME_HOUSING_TYPE_OFFICE_APARTMENTNAME_HOUSING_TYPE_RENTED_APARTMENTNAME_HOUSING_TYPE_WITH_PARENTSFLAG_OWN_CARFLAG_OWN_REALTYIDTARGET
00.7236081.2441580.1933611.00.01.00.00.00.00.0...0.01.00.00.00.00.00.01.050088100
1-0.7837771.6604950.8118160.01.01.00.00.00.00.0...0.01.00.00.00.00.01.01.050088440
2-0.7837771.6604950.8118160.01.01.00.00.00.00.0...0.01.00.00.00.00.01.01.050088490
3-0.654573-0.629355-0.4250931.00.01.00.00.00.00.0...0.01.00.00.00.00.00.01.050088730
40.723608-0.629355-0.5797070.01.00.00.00.00.01.0...0.01.00.00.00.00.01.01.050088370
5-0.654573-0.629355-0.4250931.00.01.00.00.00.00.0...0.01.00.00.00.00.00.01.050088800
60.5082670.619654-0.5797071.00.01.00.00.00.00.0...0.00.00.00.01.00.00.01.050088880
70.5082670.619654-0.5797071.00.01.00.00.00.00.0...0.00.00.00.01.00.00.01.050088890
80.9820170.2033180.1933611.00.01.00.00.00.00.0...0.00.00.00.01.00.00.01.050089090
90.9820170.2033180.1933611.00.01.00.00.00.00.0...0.00.00.00.01.00.00.01.050089150
\n", + "

10 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " AMT_INCOME_TOTAL AGE WORKYEAR CODE_GENDER_F CODE_GENDER_M \\\n", + "0 0.723608 1.244158 0.193361 1.0 0.0 \n", + "1 -0.783777 1.660495 0.811816 0.0 1.0 \n", + "2 -0.783777 1.660495 0.811816 0.0 1.0 \n", + "3 -0.654573 -0.629355 -0.425093 1.0 0.0 \n", + "4 0.723608 -0.629355 -0.579707 0.0 1.0 \n", + "5 -0.654573 -0.629355 -0.425093 1.0 0.0 \n", + "6 0.508267 0.619654 -0.579707 1.0 0.0 \n", + "7 0.508267 0.619654 -0.579707 1.0 0.0 \n", + "8 0.982017 0.203318 0.193361 1.0 0.0 \n", + "9 0.982017 0.203318 0.193361 1.0 0.0 \n", + "\n", + " NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE NAME_INCOME_TYPE_PENSIONER \\\n", + "0 1.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 1.0 0.0 \n", + "4 0.0 0.0 \n", + "5 1.0 0.0 \n", + "6 1.0 0.0 \n", + "7 1.0 0.0 \n", + "8 1.0 0.0 \n", + "9 1.0 0.0 \n", + "\n", + " NAME_INCOME_TYPE_STATE_SERVANT NAME_INCOME_TYPE_STUDENT \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.0 0.0 \n", + "8 0.0 0.0 \n", + "9 0.0 0.0 \n", + "\n", + " NAME_INCOME_TYPE_WORKING ... NAME_HOUSING_TYPE_CO_OP_APARTMENT \\\n", + "0 0.0 ... 0.0 \n", + "1 0.0 ... 0.0 \n", + "2 0.0 ... 0.0 \n", + "3 0.0 ... 0.0 \n", + "4 1.0 ... 0.0 \n", + "5 0.0 ... 0.0 \n", + "6 0.0 ... 0.0 \n", + "7 0.0 ... 0.0 \n", + "8 0.0 ... 0.0 \n", + "9 0.0 ... 
0.0 \n", + "\n", + " NAME_HOUSING_TYPE_HOUSE_APARTMENT NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT \\\n", + "0 1.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 1.0 0.0 \n", + "4 1.0 0.0 \n", + "5 1.0 0.0 \n", + "6 0.0 0.0 \n", + "7 0.0 0.0 \n", + "8 0.0 0.0 \n", + "9 0.0 0.0 \n", + "\n", + " NAME_HOUSING_TYPE_OFFICE_APARTMENT NAME_HOUSING_TYPE_RENTED_APARTMENT \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "5 0.0 0.0 \n", + "6 0.0 1.0 \n", + "7 0.0 1.0 \n", + "8 0.0 1.0 \n", + "9 0.0 1.0 \n", + "\n", + " NAME_HOUSING_TYPE_WITH_PARENTS FLAG_OWN_CAR FLAG_OWN_REALTY ID \\\n", + "0 0.0 0.0 1.0 5008810 \n", + "1 0.0 1.0 1.0 5008844 \n", + "2 0.0 1.0 1.0 5008849 \n", + "3 0.0 0.0 1.0 5008873 \n", + "4 0.0 1.0 1.0 5008837 \n", + "5 0.0 0.0 1.0 5008880 \n", + "6 0.0 0.0 1.0 5008888 \n", + "7 0.0 0.0 1.0 5008889 \n", + "8 0.0 0.0 1.0 5008909 \n", + "9 0.0 0.0 1.0 5008915 \n", + "\n", + " TARGET \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 0 \n", + "\n", + "[10 rows x 40 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "application_record_sdf.limit(10).to_pandas()" + ] + }, + { "cell_type": "markdown", "id": "37522d7a", "metadata": {}, @@ -705,7 +1776,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "6d490980", "metadata": {}, "outputs": [], @@ -714,7 +1785,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "eb7ad64f", "metadata": {}, @@ -724,7 +1794,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "70485380", "metadata": {}, @@ -734,17 +1803,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "09156afe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + 
"----------------------\n", + "|1 |503 |\n", + "|0 |29819 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "# We have highly imbalanced data\n", "application_record_sdf.group_by('TARGET').count().show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "45620b74", "metadata": {}, @@ -756,10 +1838,134 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "21de1b91", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_onnamedatabase_nameschema_nameurlhas_credentialshas_encryption_keyownercommentregiontypecloudnotification_channelstorage_integrationowner_role_type
02023-08-16 10:35:54.364000-07:00ML_PROCSHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
12023-08-16 10:35:37.919000-07:00nnycqaafpyHOL_DBPUBLICNNACCOUNTADMINNoneINTERNAL TEMPORARYNoneNoneNoneROLE
22023-08-16 10:35:31.311000-07:00ogozsusnsxHOL_DBPUBLICNNACCOUNTADMINNoneINTERNAL TEMPORARYNoneNoneNoneROLE
\n", + "
" + ], + "text/plain": [ + " created_on name database_name schema_name url \\\n", + "0 2023-08-16 10:35:54.364000-07:00 ML_PROCS HOL_DB PUBLIC \n", + "1 2023-08-16 10:35:37.919000-07:00 nnycqaafpy HOL_DB PUBLIC \n", + "2 2023-08-16 10:35:31.311000-07:00 ogozsusnsx HOL_DB PUBLIC \n", + "\n", + " has_credentials has_encryption_key owner comment region \\\n", + "0 N N ACCOUNTADMIN None \n", + "1 N N ACCOUNTADMIN None \n", + "2 N N ACCOUNTADMIN None \n", + "\n", + " type cloud notification_channel storage_integration \\\n", + "0 INTERNAL None None None \n", + "1 INTERNAL TEMPORARY None None None \n", + "2 INTERNAL TEMPORARY None None None \n", + "\n", + " owner_role_type \n", + "0 ROLE \n", + "1 ROLE \n", + "2 ROLE " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "session.sql('CREATE OR REPLACE STAGE ML_PROCS').collect()\n", "pd.DataFrame(session.sql('SHOW STAGES').collect())" @@ -767,7 +1973,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "9b84d879", "metadata": {}, "outputs": [], @@ -802,7 +2008,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b69672f5", "metadata": {}, @@ -812,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "8e9c5743", "metadata": {}, "outputs": [], @@ -829,10 +2034,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "6a764741", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\"Successfully oversampled\"'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "training_table = 'CREDIT_RISK_PREPARED'\n", "# get feature columns\n", @@ -852,10 +2068,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "id": "dee18f0c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |29819 |\n", + "|1 |29819 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "# Now our training data is balanced\n", "train_data_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED')\n", @@ -864,13 +2094,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "02c1803e", "metadata": {}, "outputs": [], "source": [ "session.close()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19ba2471", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -889,7 +2127,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/hol/2_1_DEMO_model_building_scoring.ipynb b/hol/2_1_DEMO_model_building_scoring.ipynb index e2cd7d6..019600b 100644 --- a/hol/2_1_DEMO_model_building_scoring.ipynb +++ b/hol/2_1_DEMO_model_building_scoring.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "2e339e77", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "73ac09a3", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a7d72538", "metadata": {}, @@ -29,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "2cb04fd9", "metadata": {}, "outputs": [], @@ -47,7 +44,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "22a0aaa1", "metadata": {}, @@ -57,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "e3b512a8", "metadata": {}, "outputs": [], @@ -67,7 +63,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11413743", "metadata": {}, @@ -77,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "381310d7", "metadata": {}, "outputs": [], @@ -88,10 
+83,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "9cc080c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], "source": [ "session = Session.builder.configs(connection_parameters).create()\n", "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", @@ -99,7 +103,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2a42a5d2", "metadata": {}, @@ -109,26 +112,134 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "d264e18c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status='Stage area ML_MODELS successfully created.')]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "session.sql('CREATE OR REPLACE STAGE ML_MODELS').collect()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "7c293f32", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_onnamedatabase_nameschema_nameurlhas_credentialshas_encryption_keyownercommentregiontypecloudnotification_channelstorage_integrationowner_role_type
02023-08-16 10:36:55.179000-07:00ML_MODELSHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
12023-08-16 10:35:54.364000-07:00ML_PROCSHOL_DBPUBLICNNACCOUNTADMINNoneINTERNALNoneNoneNoneROLE
\n", + "
" + ], + "text/plain": [ + " created_on name database_name schema_name url \\\n", + "0 2023-08-16 10:36:55.179000-07:00 ML_MODELS HOL_DB PUBLIC \n", + "1 2023-08-16 10:35:54.364000-07:00 ML_PROCS HOL_DB PUBLIC \n", + "\n", + " has_credentials has_encryption_key owner comment region type \\\n", + "0 N N ACCOUNTADMIN None INTERNAL \n", + "1 N N ACCOUNTADMIN None INTERNAL \n", + "\n", + " cloud notification_channel storage_integration owner_role_type \n", + "0 None None None ROLE \n", + "1 None None None ROLE " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pd.DataFrame(session.sql('SHOW STAGES').collect())" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "24d093ad", "metadata": {}, @@ -140,17 +251,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "570a3776", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |29819 |\n", + "|1 |29819 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "application_record_balanced_sdf = session.table('CREDIT_RISK_PREPARED_BALANCED')\n", "application_record_balanced_sdf.group_by('TARGET').count().show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "dd19116e", "metadata": {}, @@ -160,10 +284,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "d91ebf56", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|1 |23960 |\n", + "|0 |23819 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "train_sdf = application_record_balanced_sdf.sample_by(\"TARGET\", {1: 0.8, 0: 0.8})\n", "train_sdf = train_sdf.cache_result()\n", @@ -171,7 +309,6 @@ ] }, 
{ - "attachments": {}, "cell_type": "markdown", "id": "ca734c35", "metadata": {}, @@ -181,17 +318,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "0297ff43", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------\n", + "|\"TARGET\" |\"COUNT\" |\n", + "----------------------\n", + "|0 |6000 |\n", + "|1 |4886 |\n", + "----------------------\n", + "\n" + ] + } + ], "source": [ "test_sdf = application_record_balanced_sdf.minus(train_sdf)\n", "test_sdf.group_by('TARGET').count().show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c3fe042a", "metadata": {}, @@ -201,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "f455553a", "metadata": {}, "outputs": [], @@ -214,7 +364,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a4083f48", "metadata": {}, @@ -224,10 +373,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "6983af6f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "feature_cols = train_sdf.columns\n", "feature_cols.remove('TARGET')\n", @@ -239,7 +399,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b9e4c195", "metadata": {}, @@ -249,17 +408,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "5e951c2f", "metadata": {}, - "outputs": [], - "source": [ - "feature_coefficients = pd.DataFrame(lm.get_sklearn_object().coef_.T,lm.get_sklearn_object().feature_names_in_,columns=['Coefficient'])\n", + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "feature_coefficients = pd.DataFrame(lm.to_sklearn().coef_.T,lm.to_sklearn().feature_names_in_,columns=['Coefficient'])\n", "feature_coefficients.sort_values('Coefficient').plot.barh(y='Coefficient', figsize=(5,15))" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8097c837", "metadata": {}, @@ -269,17 +448,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "b1fbbbaf", "metadata": {}, - "outputs": [], - "source": [ - "skl_ml = lm.get_sklearn_object()\n", + "outputs": [ + { + "data": { + "text/plain": [ + "sklearn.linear_model._logistic.LogisticRegression" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skl_ml = lm.to_sklearn()\n", "type(skl_ml)" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "19a60347", "metadata": {}, @@ -288,7 +477,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "95c5ff7a", "metadata": {}, @@ -297,7 +485,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "acbb5608", "metadata": {}, @@ -307,7 +494,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "c28783e5", "metadata": {}, "outputs": [], @@ -318,7 +505,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "98cac9aa", "metadata": {}, @@ -327,7 +513,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e90db3e5", "metadata": {}, @@ -337,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "5381ced6", "metadata": {}, "outputs": [], @@ -347,17 +532,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "5990a3f3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------\n", + "|\"TARGET\" |\"CAST('0.0' AS 
FLOAT)\" |\"CAST('1.0' AS FLOAT)\" |\n", + "--------------------------------------------------------------\n", + "|1 |2132 |2754 |\n", + "|0 |3708 |2292 |\n", + "--------------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "# Obtaining a simple confusion matrix\n", "scored_sdf.crosstab('TARGET','PREDICTION').show()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "34e1289f", "metadata": {}, @@ -367,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "a21ddd7f", "metadata": {}, "outputs": [], @@ -389,23 +587,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "16087bac", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------\n", + "|\"ACCURACY\" |\"PRECISION\" |\"RECALL\" |\"F1\" |\n", + "--------------------------------------------------------\n", + "|0.593606 |0.545779 |0.563651 |0.554571048428 |\n", + "--------------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "calc_metrics(scored_sdf)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "f246388b", "metadata": {}, "outputs": [], "source": [ "session.close()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7973a12", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -424,7 +643,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/hol/3_1_SOLUTION_additional_models_xgboost.ipynb b/hol/3_1_SOLUTION_additional_models_xgboost.ipynb index 5b8b924..9f039ee 100644 --- a/hol/3_1_SOLUTION_additional_models_xgboost.ipynb +++ b/hol/3_1_SOLUTION_additional_models_xgboost.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": 
"2e339e77", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "73ac09a3", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a7d72538", "metadata": {}, @@ -29,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2cb04fd9", "metadata": {}, "outputs": [], @@ -44,7 +41,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1f47f8e9", "metadata": {}, @@ -54,10 +50,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "f82010d6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'snowflake-snowpark-python': '1.5.1',\n", + " 'pandas': '1.5.3',\n", + " 'scikit-learn': '1.2.2',\n", + " 'lightgbm': '3.3.5',\n", + " 'xgboost': '1.7.3',\n", + " 'joblib': '1.2.0',\n", + " 'imbalanced-learn': '0.10.1'}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "with open('packages_version.json') as f:\n", " packages_version = json.load(f)\n", @@ -66,7 +79,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11413743", "metadata": {}, @@ -76,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "25a54f72", "metadata": {}, "outputs": [], @@ -87,10 +99,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "9cc080c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + "Current Warehouse: \"HOL_WH\"\n" + ] + } + ], "source": [ "session = Session.builder.configs(connection_parameters).create()\n", "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", @@ -98,7 +119,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "24d093ad", "metadata": {}, @@ -108,7 +128,7 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 5, "id": "4df6c514", "metadata": {}, "outputs": [], @@ -118,7 +138,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "74d762b8", "metadata": {}, @@ -128,10 +147,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "0053ffc1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SnowparkSQLException", + "evalue": "(1304): 01ae54b3-0001-b953-002c-cb070008c562: 002003 (42S02): SQL compilation error:\nObject 'CREDIT_RISK_PREPARED_BALANCED_TRAIN' does not exist or not authorized.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mSnowparkSQLException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m feature_cols \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_sdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\n\u001b[1;32m 2\u001b[0m feature_cols\u001b[38;5;241m.\u001b[39mremove(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTARGET\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 3\u001b[0m feature_cols\u001b[38;5;241m.\u001b[39mremove(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mID\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/dataframe.py:931\u001b[0m, in \u001b[0;36mDataFrame.columns\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 916\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcolumns\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m 917\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns all column names as a list.\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \n\u001b[1;32m 919\u001b[0m 
\u001b[38;5;124;03m The returned column names are consistent with the Snowflake database object `identifier syntax `_.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 929\u001b[0m \u001b[38;5;124;03m ================================== ==========================\u001b[39;00m\n\u001b[1;32m 930\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 931\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschema\u001b[49m\u001b[38;5;241m.\u001b[39mnames\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/functools.py:967\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 965\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 966\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m--> 967\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 969\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/dataframe.py:3638\u001b[0m, in \u001b[0;36mDataFrame.schema\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 3633\u001b[0m \u001b[38;5;129m@cached_property\u001b[39m\n\u001b[1;32m 3634\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mschema\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m StructType:\n\u001b[1;32m 3635\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"The definition of the columns in this 
DataFrame (the \"relational schema\" for\u001b[39;00m\n\u001b[1;32m 3636\u001b[0m \u001b[38;5;124;03m the DataFrame).\u001b[39;00m\n\u001b[1;32m 3637\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 3638\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m StructType\u001b[38;5;241m.\u001b[39m_from_attributes(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_plan\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattributes\u001b[49m)\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/functools.py:967\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 965\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 966\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m--> 967\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 969\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py:249\u001b[0m, in \u001b[0;36mSnowflakePlan.attributes\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[38;5;129m@cached_property\u001b[39m\n\u001b[1;32m 248\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mattributes\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Attribute]:\n\u001b[0;32m--> 249\u001b[0m output \u001b[38;5;241m=\u001b[39m 
\u001b[43manalyze_attributes\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mschema_query\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mschema_query \u001b[38;5;241m=\u001b[39m schema_value_statement(output)\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/schema_utils.py:82\u001b[0m, in \u001b[0;36manalyze_attributes\u001b[0;34m(sql, session)\u001b[0m\n\u001b[1;32m 79\u001b[0m session\u001b[38;5;241m.\u001b[39m_run_query(sql)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_result_meta_to_attribute(session\u001b[38;5;241m.\u001b[39m_conn\u001b[38;5;241m.\u001b[39m_cursor\u001b[38;5;241m.\u001b[39mdescription)\n\u001b[0;32m---> 82\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_result_attributes\u001b[49m\u001b[43m(\u001b[49m\u001b[43msql\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/session.py:1247\u001b[0m, in \u001b[0;36mSession._get_result_attributes\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 1246\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_result_attributes\u001b[39m(\u001b[38;5;28mself\u001b[39m, query: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Attribute]:\n\u001b[0;32m-> 1247\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_result_attributes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py:180\u001b[0m, in \u001b[0;36mSnowflakePlan.Decorator.wrap_exception..wrap\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 177\u001b[0m ne \u001b[38;5;241m=\u001b[39m SnowparkClientExceptionMessages\u001b[38;5;241m.\u001b[39mSQL_EXCEPTION_FROM_PROGRAMMING_ERROR(\n\u001b[1;32m 178\u001b[0m e\n\u001b[1;32m 179\u001b[0m )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ne\u001b[38;5;241m.\u001b[39mwith_traceback(tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/_internal/analyzer/snowflake_plan.py:110\u001b[0m, in \u001b[0;36mSnowflakePlan.Decorator.wrap_exception..wrap\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 110\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m snowflake\u001b[38;5;241m.\u001b[39mconnector\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mProgrammingError \u001b[38;5;28;01mas\u001b[39;00m 
e:\n\u001b[1;32m 112\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/snowpark/_internal/server_connection.py:206\u001b[0m, in \u001b[0;36mServerConnection.get_result_attributes\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[38;5;129m@SnowflakePlan\u001b[39m\u001b[38;5;241m.\u001b[39mDecorator\u001b[38;5;241m.\u001b[39mwrap_exception\n\u001b[1;32m 205\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_result_attributes\u001b[39m(\u001b[38;5;28mself\u001b[39m, query: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Attribute]:\n\u001b[0;32m--> 206\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m convert_result_meta_to_attribute(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdescribe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m)\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/connector/cursor.py:926\u001b[0m, in \u001b[0;36mSnowflakeCursor.describe\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 917\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Obtain the schema of the result without executing the query.\u001b[39;00m\n\u001b[1;32m 918\u001b[0m \n\u001b[1;32m 919\u001b[0m \u001b[38;5;124;03mThis function takes the same arguments as execute, please refer to that function\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 923\u001b[0m \u001b[38;5;124;03m The schema of the result.\u001b[39;00m\n\u001b[1;32m 924\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 925\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_describe_only\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m 
kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_is_internal\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 926\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 927\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_description\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/connector/cursor.py:904\u001b[0m, in \u001b[0;36mSnowflakeCursor.execute\u001b[0;34m(self, command, params, _bind_stage, timeout, _exec_async, _no_retry, _do_reset, _put_callback, _put_azure_callback, _put_callback_output_stream, _get_callback, _get_azure_callback, _get_callback_output_stream, _show_progress_bar, _statement_params, _is_internal, _describe_only, _no_results, _is_put_get, _raise_put_get_error, _force_put_overwrite, _skip_upload_on_content_match, file_stream, num_statements)\u001b[0m\n\u001b[1;32m 900\u001b[0m is_integrity_error \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 901\u001b[0m code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m100072\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 902\u001b[0m ) \u001b[38;5;66;03m# NULL result in a non-nullable column\u001b[39;00m\n\u001b[1;32m 903\u001b[0m error_class \u001b[38;5;241m=\u001b[39m IntegrityError \u001b[38;5;28;01mif\u001b[39;00m is_integrity_error \u001b[38;5;28;01melse\u001b[39;00m ProgrammingError\n\u001b[0;32m--> 904\u001b[0m 
\u001b[43mError\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrorhandler_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_class\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 905\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/connector/errors.py:290\u001b[0m, in \u001b[0;36mError.errorhandler_wrapper\u001b[0;34m(connection, cursor, error_class, error_value)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[38;5;129m@staticmethod\u001b[39m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21merrorhandler_wrapper\u001b[39m(\n\u001b[1;32m 269\u001b[0m connection: SnowflakeConnection \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 272\u001b[0m error_value: \u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Any],\n\u001b[1;32m 273\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 274\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Error handler wrapper that calls the errorhandler method.\u001b[39;00m\n\u001b[1;32m 275\u001b[0m \n\u001b[1;32m 276\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 287\u001b[0m \u001b[38;5;124;03m exception to the first handler in that order.\u001b[39;00m\n\u001b[1;32m 288\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 290\u001b[0m handed_over \u001b[38;5;241m=\u001b[39m \u001b[43mError\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhand_to_other_handler\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 
291\u001b[0m \u001b[43m \u001b[49m\u001b[43mconnection\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 292\u001b[0m \u001b[43m \u001b[49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 293\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_class\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_value\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m handed_over:\n\u001b[1;32m 297\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Error\u001b[38;5;241m.\u001b[39merrorhandler_make_exception(\n\u001b[1;32m 298\u001b[0m error_class,\n\u001b[1;32m 299\u001b[0m error_value,\n\u001b[1;32m 300\u001b[0m )\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/connector/errors.py:345\u001b[0m, in \u001b[0;36mError.hand_to_other_handler\u001b[0;34m(connection, cursor, error_class, error_value)\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cursor \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 344\u001b[0m cursor\u001b[38;5;241m.\u001b[39mmessages\u001b[38;5;241m.\u001b[39mappend((error_class, error_value))\n\u001b[0;32m--> 345\u001b[0m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merrorhandler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconnection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcursor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_class\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m connection \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/anaconda3/envs/pysnowpark/lib/python3.8/site-packages/snowflake/connector/errors.py:221\u001b[0m, in \u001b[0;36mError.default_errorhandler\u001b[0;34m(connection, cursor, error_class, error_value)\u001b[0m\n\u001b[1;32m 219\u001b[0m errno \u001b[38;5;241m=\u001b[39m error_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merrno\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 220\u001b[0m done_format_msg \u001b[38;5;241m=\u001b[39m error_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdone_format_msg\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 221\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(\n\u001b[1;32m 222\u001b[0m msg\u001b[38;5;241m=\u001b[39merror_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmsg\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 223\u001b[0m errno\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m errno \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mint\u001b[39m(errno),\n\u001b[1;32m 224\u001b[0m sqlstate\u001b[38;5;241m=\u001b[39merror_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msqlstate\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 225\u001b[0m sfqid\u001b[38;5;241m=\u001b[39merror_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msfqid\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 226\u001b[0m query\u001b[38;5;241m=\u001b[39merror_value\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m 227\u001b[0m done_format_msg\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m done_format_msg 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mbool\u001b[39m(done_format_msg)\n\u001b[1;32m 229\u001b[0m ),\n\u001b[1;32m 230\u001b[0m connection\u001b[38;5;241m=\u001b[39mconnection,\n\u001b[1;32m 231\u001b[0m cursor\u001b[38;5;241m=\u001b[39mcursor,\n\u001b[1;32m 232\u001b[0m )\n", + "\u001b[0;31mSnowparkSQLException\u001b[0m: (1304): 01ae54b3-0001-b953-002c-cb070008c562: 002003 (42S02): SQL compilation error:\nObject 'CREDIT_RISK_PREPARED_BALANCED_TRAIN' does not exist or not authorized." + ] + } + ], "source": [ "feature_cols = train_sdf.columns\n", "feature_cols.remove('TARGET')\n", @@ -150,12 +196,11 @@ "outputs": [], "source": [ "# Plot feature importance\n", - "feat_importance = pd.DataFrame(xgbmodel.get_sklearn_object().feature_importances_,feature_cols,columns=['FeatImportance'])\n", + "feat_importance = pd.DataFrame(xgbmodel.to_xgboost().feature_importances_,feature_cols,columns=['FeatImportance'])\n", "feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "19a60347", "metadata": {}, @@ -164,7 +209,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "95c5ff7a", "metadata": {}, @@ -173,7 +217,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5046b342", "metadata": {}, @@ -194,6 +237,30 @@ "session.table('CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED').show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "61102c2c", + "metadata": {}, + "outputs": [], + "source": [ + "import snowflake.snowpark.functions as F\n", + "\n", + "def calc_metrics(snf_df):\n", + " return snf_df.group_by(['TARGET','PREDICTION']).count()\\\n", + " .with_column(\"type\", F.when((F.col(\"TARGET\") == 0) & (F.col(\"PREDICTION\") == 0), \"tn\")\\\n", + " .when((F.col(\"TARGET\") == 0) & (F.col(\"PREDICTION\") == 1), \"fp\")\\\n", + " .when((F.col(\"TARGET\") == 1) & 
(F.col(\"PREDICTION\") == 0), \"fn\")\\\n", + " .when((F.col(\"TARGET\") == 1) & (F.col(\"PREDICTION\") == 1), \"tp\"))\\\n", + " .select([\"TYPE\", \"COUNT\"]).pivot(\"TYPE\", ['tn', 'tp', 'fn', 'fp']).sum(\"COUNT\")\\\n", + " .with_columns([\"accuracy\", \"precision\", \"recall\"],\n", + " [((F.col(\"'tp'\") + F.col(\"'tn'\")) / (F.col(\"'tp'\") + F.col(\"'tn'\") + F.col(\"'fn'\") + F.col(\"'fp'\")))\n", + " , (F.col(\"'tp'\") / (F.col(\"'tp'\") + F.col(\"'fp'\")))\n", + " ,(F.col(\"'tp'\") / (F.col(\"'tp'\") + F.col(\"'fn'\")))])\\\n", + " .with_column(\"f1\", (F.lit(2)*F.col(\"precision\")*F.col(\"recall\")) / (F.col(\"precision\")+F.col(\"recall\")))\\\n", + " .select([\"ACCURACY\",\"PRECISION\", \"RECALL\",\"F1\"]).show()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -221,7 +288,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb b/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb index 8b27192..4cbd527 100644 --- a/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb +++ b/hol/3_2_SOLUTION_additional_models_lightgbm.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "2e339e77", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "73ac09a3", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a7d72538", "metadata": {}, @@ -37,6 +34,7 @@ "from snowflake.snowpark.session import Session\n", "import snowflake.snowpark.types as T\n", "\n", + "import joblib\n", "from snowflake.ml.modeling.lightgbm import LGBMClassifier\n", "\n", "import json\n", @@ -44,7 +42,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c4442706", "metadata": {}, @@ -64,7 +61,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11413743", "metadata": {}, @@ -96,7 +92,6 @@ ] }, { - 
"attachments": {}, "cell_type": "markdown", "id": "24d093ad", "metadata": {}, @@ -116,7 +111,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b36c435c", "metadata": {}, @@ -124,36 +118,152 @@ "# SOLUTION: Train a LightGBM Model on a Snowpark DataFrame using snowpark-ml" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "91221b05", + "metadata": {}, + "outputs": [], + "source": [ + "# This local Python-function will be registered as a Stored Procedure and runs in Snowflake\n", + "sk_version = \"1.2.2\"\n", + "snowpark_version = \"1.3.0\"\n", + "pandas_version = \"1.5.3\"\n", + "lightgbm_version = \"3.3.5\"\n" + ] + }, { "cell_type": "code", "execution_count": null, "id": "584b35e7", "metadata": {}, "outputs": [], + "source": [ + "def sproc_train_lightgbm_model(session: Session, \n", + " training_table: str, \n", + " feature_cols: list,\n", + " target_col: str,\n", + " model_name: str) -> T.Variant:\n", + " \n", + " # WORKFLOW\n", + " # 1: Load data into Pandas DataFrame\n", + " # 2: Define features and Label\n", + " # 3: Train the model\n", + " # 4: (Optional) Return feature importance\n", + " # 5: Save the model and upload to Snowflake Stage\n", + " # 6: Return feature importance or success-message\n", + " # Hint: Make sure return is json-compatible (e.g. 
via calling to_dict())\n", + "\n", + " # 1: Load data into Pandas DataFrame\n", + " local_training_data = training_table.to_pandas()\n", + "\n", + " # 2: Define features and Label\n", + " X = local_training_data[feature_cols]\n", + " y = local_training_data[target_col]\n", + " \n", + " # 3: Train the model\n", + " lgbmodel = LGBMClassifier(input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n", + " lgbmodel.fit(training_table)\n", + " \n", + " # 4: (Optional) Return feature importance\n", + " feat_importance = pd.DataFrame(lgbmodel.to_lightgbm().feature_importances_,feature_cols,columns=['FeatImportance'])\n", + "\n", + " # 5: Save the model and upload to Snowflake Stage\n", + " joblib.dump(lgbmodel, \"/tmp/\" + model_name)\n", + " session.file.put(\"/tmp/\" + model_name, \"@MODEL_STAGE\", auto_compress=False, overwrite=True)\n", + " \n", + " # 6: Return feature importance or success-message\n", + " feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))\n", + "\n", + "\n", + " return feat_importance" + ] + }, + { + "cell_type": "markdown", + "id": "e38d9398", + "metadata": {}, + "source": [ + "# SOLUTION: Register your Stored Procedure to train an LightGBM Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27de1ca9", + "metadata": {}, + "outputs": [], + "source": [ + "session.sql(\"CREATE STAGE IF NOT EXISTS model_stage\").collect()\n", + "pd.DataFrame(session.sql(\"LIST @MODEL_STAGE\").collect())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d578865", + "metadata": {}, + "outputs": [], + "source": [ + "session.add_packages(\"snowflake-snowpark-python==1.5.1\",\n", + " \"pandas==1.5.3\", \"scikit-learn==1.2.2\", \"lightgbm==3.3.5\",\n", + " \"xgboost==1.7.3\", \"joblib==1.2.0\", \"imbalanced-learn==0.10.1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03b45ce7", + "metadata": {}, + "outputs": [], + "source": [ + "# 
Registering the function as a Stored Procedure\n", + "sproc_train_lightgbm_model = session.sproc.register(func=sproc_train_lightgbm_model, \n", + " name='sproc_train_lgbm_model', \n", + " is_permanent=True, \n", + " replace=True, \n", + " stage_location='MODEL_STAGE')" + ] + }, + { + "cell_type": "markdown", + "id": "979b1b1c", + "metadata": {}, + "source": [ + "# SOLUTION: Run your Stored Procedure to train an LightGBM Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01b72e16", + "metadata": {}, + "outputs": [], "source": [ "feature_cols = train_sdf.columns\n", "feature_cols.remove('TARGET')\n", "feature_cols.remove('ID')\n", - "target_col = 'TARGET'\n", "\n", - "lgbmodel = LGBMClassifier(input_cols=feature_cols, label_cols=target_col, output_cols='PREDICTION')\n", - "lgbmodel.fit(train_sdf)" + "result = sproc_train_lightgbm_model(training_table=train_sdf, \n", + " feature_cols=feature_cols, \n", + " target_col='TARGET',\n", + " model_name=\"train_lgbm_model\", \n", + " session=session)\n", + "result" ] }, { "cell_type": "code", "execution_count": null, - "id": "bedab745", + "id": "c1239449", "metadata": {}, "outputs": [], "source": [ - "# Plot feature importance\n", - "feat_importance = pd.DataFrame(lgbmodel.get_sklearn_object().feature_importances_,feature_cols,columns=['FeatImportance'])\n", - "feat_importance.sort_values('FeatImportance').plot.barh(y='FeatImportance', figsize=(5,15))" + "# The model is now stored in a Snowflake stage\n", + "pd.DataFrame(session.sql('LIST @MODEL_STAGE').collect()) " ] }, { - "attachments": {}, "cell_type": "markdown", "id": "19a60347", "metadata": {}, @@ -162,7 +272,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "95c5ff7a", "metadata": {}, @@ -171,12 +280,27 @@ ] }, { - "attachments": {}, + "cell_type": "markdown", + "id": "64569d68", + "metadata": {}, + "source": [ + "# SOLUTION: Define your UDF to Score an LightGBM Model" + ] + }, + { "cell_type": "markdown", "id": "3f2baa9c", 
"metadata": {}, "source": [ - "# SOLUTION: Use the fitted LightGBM Model to score a Snowpark DataFrame" + "# SOLUTION: Register your UDF to Score an LightGBM Model" + ] + }, + { + "cell_type": "markdown", + "id": "b9ecebe0", + "metadata": {}, + "source": [ + "# SOLUTION: Run your UDF to Score an LightGBM Model" ] }, { @@ -218,7 +342,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4, diff --git a/hol/4_1_DEMO_hyperparameter_tuning_gridsearch.ipynb b/hol/4_1_DEMO_hyperparameter_tuning_gridsearch.ipynb index ad7c5fe..6cae8a0 100644 --- a/hol/4_1_DEMO_hyperparameter_tuning_gridsearch.ipynb +++ b/hol/4_1_DEMO_hyperparameter_tuning_gridsearch.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "2e339e77", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "73ac09a3", "metadata": {}, @@ -19,7 +17,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a7d72538", "metadata": {}, @@ -29,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "2cb04fd9", "metadata": {}, "outputs": [], @@ -48,7 +45,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "91955f46", "metadata": {}, @@ -58,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e640693e", "metadata": {}, "outputs": [], @@ -68,7 +64,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11413743", "metadata": {}, @@ -78,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "04bad29b", "metadata": {}, "outputs": [], @@ -89,10 +84,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "9cc080c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Database and schema: \"HOL_DB\".\"PUBLIC\"\n", + 
"Current Warehouse: \"HOL_WH\"\n" + ] + } + ], "source": [ "session = Session.builder.configs(connection_parameters).create()\n", "print(f\"Current Database and schema: {session.get_fully_qualified_current_schema()}\")\n", @@ -100,7 +104,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "24d093ad", "metadata": {}, @@ -110,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "4df6c514", "metadata": {}, "outputs": [], @@ -121,10 +124,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "442a5131", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "feature_cols = train_sdf.columns\n", "feature_cols.remove('TARGET')\n", @@ -148,7 +162,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "55c233ba", "metadata": {}, @@ -158,13 +171,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "46794ab5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Analyze grid search results\n", - "gs_results = grid_search.get_sklearn_object().cv_results_\n", + "gs_results = grid_search.to_sklearn().cv_results_\n", "n_estimators_val = []\n", "max_depth_val = []\n", "for param_dict in gs_results[\"params\"]:\n", @@ -178,6 +202,14 @@ "sns.relplot(data=gs_results_df, x=\"max_depth\", y=\"f1_val\", hue=\"n_estimators\", kind='line')\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db5c6afd", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -196,7 +228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" } }, "nbformat": 4,