From 4851394fd6248f20b664d54e7eba7d1762e87508 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 3 Nov 2023 15:43:17 +0100 Subject: [PATCH] ENH Add Adult census dataset description (#747) --- jupyter-book/_toc.yml | 2 +- .../appendix/adult_census_description.md | 11 - notebooks/datasets_adult_census.ipynb | 203 ++++++++++++++++++ python_scripts/datasets_adult_census.py | 160 ++++++++++++++ python_scripts/linear_models_ex_03.py | 2 +- python_scripts/linear_models_sol_03.py | 2 +- 6 files changed, 366 insertions(+), 14 deletions(-) delete mode 100644 jupyter-book/appendix/adult_census_description.md create mode 100644 notebooks/datasets_adult_census.ipynb create mode 100644 python_scripts/datasets_adult_census.py diff --git a/jupyter-book/_toc.yml b/jupyter-book/_toc.yml index 01d356e74..1907f35a5 100644 --- a/jupyter-book/_toc.yml +++ b/jupyter-book/_toc.yml @@ -213,7 +213,7 @@ parts: - file: appendix/datasets_intro sections: - file: python_scripts/trees_dataset - - file: appendix/adult_census_description + - file: python_scripts/datasets_adult_census - file: python_scripts/datasets_california_housing - file: python_scripts/datasets_ames_housing - file: python_scripts/datasets_blood_transfusion diff --git a/jupyter-book/appendix/adult_census_description.md b/jupyter-book/appendix/adult_census_description.md deleted file mode 100644 index d7775cf7d..000000000 --- a/jupyter-book/appendix/adult_census_description.md +++ /dev/null @@ -1,11 +0,0 @@ -# The adult census dataset - -This dataset is a collection of information related to a person. The prediction -task is to predict whether a person is earning a salary above or below 50 k$. - -We extensively exploring this dataset in the first module "The predictive -modeling pipeline", in the first sequence "Tabular data exploration", in the -first notebook "First look at our dataset". 
- -To avoid repeating the same information, we redirect the reader to this -particular notebook. diff --git a/notebooks/datasets_adult_census.ipynb b/notebooks/datasets_adult_census.ipynb new file mode 100644 index 000000000..371525cc2 --- /dev/null +++ b/notebooks/datasets_adult_census.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Adult census dataset\n", + "\n", + "[This dataset](http://www.openml.org/d/1590) is a collection of demographic\n", + "information for the adult population as of 1994 in the USA. The prediction\n", + "task is to predict whether a person is earning a high or low revenue in\n", + "USD/year.\n", + "\n", + "The column named **class** is the target variable (i.e., the variable which we\n", + "want to predict). The two possible classes are `\" <=50K\"` (low-revenue) and\n", + "`\" >50K\"` (high-revenue).\n", + "\n", + "Before drawing any conclusions based on its statistics or the predictions of\n", + "models trained on it, remember that this dataset is not only outdated, but is\n", + "also not representative of the US population. In fact, the original data\n", + "contains a feature named `fnlwgt` that encodes the number of units in the\n", + "target population that the responding unit represents.\n", + "\n", + "First we load the dataset. We keep only some columns of interest to ease the\n", + "plotting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "adult_census = pd.read_csv(\"../datasets/adult-census.csv\")\n", + "columns_to_plot = [\n", + " \"age\",\n", + " \"education-num\",\n", + " \"capital-loss\",\n", + " \"capital-gain\",\n", + " \"hours-per-week\",\n", + " \"relationship\",\n", + " \"class\",\n", + "]\n", + "target_name = \"class\"\n", + "target = adult_census[target_name]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We explore this dataset in the first module's notebook \"First look at our\n", + "dataset\", where we provide a first intuition on how the data is structured.\n", + "There, we use a seaborn pairplot to visualize pairwise relationships between\n", + "the numerical variables in the dataset. This tool aligns scatter plots for every pair\n", + "of variables and histograms for the plots in the\n", + "diagonal of the array.\n", + "\n", + "This approach is limited:\n", + "- Pair plots can only deal with numerical features and;\n", + "- by observing pairwise interactions we end up with a two-dimensional\n", + " projection of a multi-dimensional feature space, which can lead to a wrong\n", + " interpretation of the individual impact of a feature.\n", + "\n", + "Here we explore with some more detail the relation between features using\n", + "plotly `Parcoords`." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import plotly.graph_objects as go\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "le = LabelEncoder()\n",
+    "\n",
+    "\n",
+    "def generate_dict(col):\n",
+    "    \"\"\"Check if column is categorical and generate the appropriate dict\"\"\"\n",
+    "    if adult_census[col].dtype == \"object\":  # Categorical column\n",
+    "        encoded = le.fit_transform(adult_census[col])\n",
+    "        return {\n",
+    "            \"tickvals\": list(range(len(le.classes_))),\n",
+    "            \"ticktext\": list(le.classes_),\n",
+    "            \"label\": col,\n",
+    "            \"values\": encoded,\n",
+    "        }\n",
+    "    else:  # Numerical column\n",
+    "        return {\"label\": col, \"values\": adult_census[col]}\n",
+    "\n",
+    "\n",
+    "plot_list = [generate_dict(col) for col in columns_to_plot]\n",
+    "\n",
+    "fig = go.Figure(\n",
+    "    data=go.Parcoords(\n",
+    "        line=dict(\n",
+    "            color=le.fit_transform(target),\n",
+    "            colorscale=\"Viridis\",\n",
+    "        ),\n",
+    "        dimensions=plot_list,\n",
+    "    )\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `Parcoords` plot is quite similar to the parallel coordinates plot that we\n",
+    "present in the module on hyperparameters tuning in this mooc. It displays the\n",
+    "values of the features on different columns while the target class is color\n",
+    "coded. Thus, we are able to quickly inspect if there is a range of values for\n",
+    "a certain feature which is leading to a particular result.\n",
+    "\n",
+    "As in the parallel coordinates plot, it is possible to select one or more\n",
+    "ranges of values by clicking and holding on any axis of the plot. You can then\n",
+    "slide (move) the range selection and cross two selections to see the\n",
+    "intersections. 
You can undo a selection by clicking once again on the same\n",
+    "axis.\n",
+    "\n",
+    "In particular for this dataset we observe that values of `\"age\"` lower than 20\n",
+    "years are quite predictive of low-income, regardless of the value of other\n",
+    "features. Similarly, a `\"capital-loss\"` above `4000` seems to lead to\n",
+    "low-income.\n",
+    "\n",
+    "Even if it is beyond the scope of the present MOOC, one can additionally\n",
+    "explore correlations between features, for example, using Spearman's rank\n",
+    "correlation, as the more popular Pearson's correlation is only appropriate for\n",
+    "continuous data that is normally distributed and linearly related. Spearman's\n",
+    "correlation is more versatile in dealing with non-linear relationships and\n",
+    "ordinal data, but it is not meant for nominal categorical data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "from scipy.cluster import hierarchy\n",
+    "from scipy.spatial.distance import squareform\n",
+    "from scipy.stats import spearmanr\n",
+    "\n",
+    "# Keep numerical features only\n",
+    "X = adult_census[columns_to_plot].drop(columns=[\"class\", \"relationship\"])\n",
+    "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))\n",
+    "corr = spearmanr(X).correlation\n",
+    "\n",
+    "# Ensure the correlation matrix is symmetric\n",
+    "corr = (corr + corr.T) / 2\n",
+    "np.fill_diagonal(corr, 1)\n",
+    "\n",
+    "# We convert the correlation matrix to a distance matrix before performing\n",
+    "# hierarchical clustering using Ward's linkage.\n",
+    "distance_matrix = 1 - np.abs(corr)\n",
+    "dist_linkage = hierarchy.ward(squareform(distance_matrix))\n",
+    "dendro = hierarchy.dendrogram(\n",
+    "    dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90\n",
+    ")\n",
+    "dendro_idx = np.arange(0, len(dendro[\"ivl\"]))\n",
+    "\n",
+    "ax2.imshow(corr[dendro[\"leaves\"], 
:][:, dendro[\"leaves\"]], cmap=\"coolwarm\")\n", + "ax2.set_xticks(dendro_idx)\n", + "ax2.set_yticks(dendro_idx)\n", + "ax2.set_xticklabels(dendro[\"ivl\"], rotation=\"vertical\")\n", + "ax2.set_yticklabels(dendro[\"ivl\"])\n", + "_ = fig.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using a [diverging\n", + "colormap](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging)\n", + "such as \"coolwarm\", the softer the color, the less (anti)correlation between\n", + "features (no correlation is mapped to white color). In this case dark blue\n", + "represents strong negative correlations and dark red means strong positive\n", + "correlations. Indeed, any feature is perfectly correlated to itself." + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/python_scripts/datasets_adult_census.py b/python_scripts/datasets_adult_census.py new file mode 100644 index 000000000..f86bf40ef --- /dev/null +++ b/python_scripts/datasets_adult_census.py @@ -0,0 +1,160 @@ +# --- +# jupyter: +# kernelspec: +# display_name: Python 3 +# name: python3 +# --- + +# %% [markdown] +# # The adult census dataset +# +# [This dataset](http://www.openml.org/d/1590) is a collection of demographic +# information for the adult population as of 1994 in the USA. The prediction +# task is to predict whether a person is earning a high or low revenue in +# USD/year. +# +# The column named **class** is the target variable (i.e., the variable which we +# want to predict). The two possible classes are `" <=50K"` (low-revenue) and +# `" >50K"` (high-revenue). 
+# +# Before drawing any conclusions based on its statistics or the predictions of +# models trained on it, remember that this dataset is not only outdated, but is +# also not representative of the US population. In fact, the original data +# contains a feature named `fnlwgt` that encodes the number of units in the +# target population that the responding unit represents. +# +# First we load the dataset. We keep only some columns of interest to ease the +# plotting. + +# %% +import pandas as pd + +adult_census = pd.read_csv("../datasets/adult-census.csv") +columns_to_plot = [ + "age", + "education-num", + "capital-loss", + "capital-gain", + "hours-per-week", + "relationship", + "class", +] +target_name = "class" +target = adult_census[target_name] + +# %% [markdown] +# We explore this dataset in the first module's notebook "First look at our +# dataset", where we provide a first intuition on how the data is structured. +# There, we use a seaborn pairplot to visualize pairwise relationships between +# the numerical variables in the dataset. This tool aligns scatter plots for every pair +# of variables and histograms for the plots in the +# diagonal of the array. +# +# This approach is limited: +# - Pair plots can only deal with numerical features and; +# - by observing pairwise interactions we end up with a two-dimensional +# projection of a multi-dimensional feature space, which can lead to a wrong +# interpretation of the individual impact of a feature. +# +# Here we explore with some more detail the relation between features using +# plotly `Parcoords`. 
+
+# %%
+import plotly.graph_objects as go
+from sklearn.preprocessing import LabelEncoder
+
+le = LabelEncoder()
+
+
+def generate_dict(col):
+    """Check if column is categorical and generate the appropriate dict"""
+    if adult_census[col].dtype == "object":  # Categorical column
+        encoded = le.fit_transform(adult_census[col])
+        return {
+            "tickvals": list(range(len(le.classes_))),
+            "ticktext": list(le.classes_),
+            "label": col,
+            "values": encoded,
+        }
+    else:  # Numerical column
+        return {"label": col, "values": adult_census[col]}
+
+
+plot_list = [generate_dict(col) for col in columns_to_plot]
+
+fig = go.Figure(
+    data=go.Parcoords(
+        line=dict(
+            color=le.fit_transform(target),
+            colorscale="Viridis",
+        ),
+        dimensions=plot_list,
+    )
+)
+fig.show()
+
+# %% [markdown]
+# The `Parcoords` plot is quite similar to the parallel coordinates plot that we
+# present in the module on hyperparameters tuning in this mooc. It displays the
+# values of the features on different columns while the target class is color
+# coded. Thus, we are able to quickly inspect if there is a range of values for
+# a certain feature which is leading to a particular result.
+#
+# As in the parallel coordinates plot, it is possible to select one or more
+# ranges of values by clicking and holding on any axis of the plot. You can then
+# slide (move) the range selection and cross two selections to see the
+# intersections. You can undo a selection by clicking once again on the same
+# axis.
+#
+# In particular for this dataset we observe that values of `"age"` lower than 20
+# years are quite predictive of low-income, regardless of the value of other
+# features. Similarly, a `"capital-loss"` above `4000` seems to lead to
+# low-income.
+# +# Even if it is beyond the scope of the present MOOC, one can additionally +# explore correlations between features, for example, using Spearman's rank +# correlation, as the more popular Pearson's correlation is only appropriate for +# continuous data that is normally distributed and linearly related. Spearman's +# correlation is more versatile in dealing with non-linear relationships and +# ordinal data, but it is not meant for nominal categorical data. + +# %% +import matplotlib.pyplot as plt +import numpy as np + +from scipy.cluster import hierarchy +from scipy.spatial.distance import squareform +from scipy.stats import spearmanr + +# Keep numerical features only +X = adult_census[columns_to_plot].drop(columns=["class", "relationship"]) +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) +corr = spearmanr(X).correlation + +# Ensure the correlation matrix is symmetric +corr = (corr + corr.T) / 2 +np.fill_diagonal(corr, 1) + +# We convert the correlation matrix to a distance matrix before performing +# hierarchical clustering using Ward's linkage. +distance_matrix = 1 - np.abs(corr) +dist_linkage = hierarchy.ward(squareform(distance_matrix)) +dendro = hierarchy.dendrogram( + dist_linkage, labels=X.columns.to_list(), ax=ax1, leaf_rotation=90 +) +dendro_idx = np.arange(0, len(dendro["ivl"])) + +ax2.imshow(corr[dendro["leaves"], :][:, dendro["leaves"]], cmap="coolwarm") +ax2.set_xticks(dendro_idx) +ax2.set_yticks(dendro_idx) +ax2.set_xticklabels(dendro["ivl"], rotation="vertical") +ax2.set_yticklabels(dendro["ivl"]) +_ = fig.tight_layout() + +# %% [markdown] +# Using a [diverging +# colormap](https://matplotlib.org/stable/users/explain/colors/colormaps.html#diverging) +# such as "coolwarm", the softer the color, the less (anti)correlation between +# features (no correlation is mapped to white color). In this case dark blue +# represents strong negative correlations and dark red means strong positive +# correlations. 
Indeed, any feature is perfectly correlated to itself. diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 50fe942cd..bede1ad85 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -79,7 +79,7 @@ # Write your code here. # %% [markdown] -# For the following questions, you can copy adn paste the following snippet to +# For the following questions, you can copy and paste the following snippet to # get the feature names from the column transformer here named `preprocessor`. # # ```python diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index c76806a45..a37bbf2fb 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -145,7 +145,7 @@ ) # %% [markdown] -# For the following questions, you can copy adn paste the following snippet to +# For the following questions, you can copy and paste the following snippet to # get the feature names from the column transformer here named `preprocessor`. # # ```python