From 09ad771674fc8c75033a2a86ea69d166f38443b1 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:48:28 +0200 Subject: [PATCH] MAINT Remove head method (#766) Co-authored-by: ArturoAmorQ --- python_scripts/01_tabular_data_exploration.py | 9 +++++++++ python_scripts/02_numerical_pipeline_hands_on.py | 8 ++++---- python_scripts/02_numerical_pipeline_introduction.py | 4 ++-- python_scripts/03_categorical_pipeline.py | 4 ++-- .../03_categorical_pipeline_column_transformer.py | 2 +- python_scripts/cross_validation_train_test.py | 6 +++--- python_scripts/linear_models_ex_02.py | 2 +- python_scripts/linear_models_sol_02.py | 2 +- python_scripts/linear_regression_without_sklearn.py | 2 +- python_scripts/parameter_tuning_grid_search.py | 4 ++-- python_scripts/parameter_tuning_manual.py | 2 +- python_scripts/parameter_tuning_randomized_search.py | 2 +- python_scripts/trees_dataset.py | 2 +- 13 files changed, 29 insertions(+), 20 deletions(-) diff --git a/python_scripts/01_tabular_data_exploration.py b/python_scripts/01_tabular_data_exploration.py index 4b07c4add..3441f490f 100644 --- a/python_scripts/01_tabular_data_exploration.py +++ b/python_scripts/01_tabular_data_exploration.py @@ -70,6 +70,15 @@ # %% adult_census.head() +# %% [markdown] +# An alternative is to omit the `head` method. This would output the initial and +# final rows and columns, but everything in between is not shown by default. It +# also provides the dataframe's dimensions at the bottom in the format `n_rows` +# x `n_columns`. + +# %% +adult_census + # %% [markdown] # The column named **class** is our target variable (i.e., the variable which we # want to predict). 
The two possible classes are `<=50K` (low-revenue) and diff --git a/python_scripts/02_numerical_pipeline_hands_on.py b/python_scripts/02_numerical_pipeline_hands_on.py index 913b78105..ba4212017 100644 --- a/python_scripts/02_numerical_pipeline_hands_on.py +++ b/python_scripts/02_numerical_pipeline_hands_on.py @@ -34,7 +34,7 @@ adult_census = pd.read_csv("../datasets/adult-census.csv") # drop the duplicated column `"education-num"` as stated in the first notebook adult_census = adult_census.drop(columns="education-num") -adult_census.head() +adult_census # %% [markdown] # The next step separates the target from the data. We performed the same @@ -44,7 +44,7 @@ data, target = adult_census.drop(columns="class"), adult_census["class"] # %% -data.head() +data # %% target @@ -95,7 +95,7 @@ # the `object` data type. # %% -data.head() +data # %% [markdown] # We see that the `object` data type corresponds to columns containing strings. @@ -105,7 +105,7 @@ # %% numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week"] -data[numerical_columns].head() +data[numerical_columns] # %% [markdown] # Now that we limited the dataset to numerical columns only, we can analyse diff --git a/python_scripts/02_numerical_pipeline_introduction.py b/python_scripts/02_numerical_pipeline_introduction.py index ca56a13fb..8a245611a 100644 --- a/python_scripts/02_numerical_pipeline_introduction.py +++ b/python_scripts/02_numerical_pipeline_introduction.py @@ -39,7 +39,7 @@ # Let's have a look at the first records of this dataframe: # %% -adult_census.head() +adult_census # %% [markdown] # We see that this CSV file contains all information: the target that we would @@ -56,7 +56,7 @@ # %% data = adult_census.drop(columns=[target_name]) -data.head() +data # %% [markdown] # We can now linger on the variables, also denominated features, that we later diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py index 62cd9be98..64b516070 100644 
--- a/python_scripts/03_categorical_pipeline.py +++ b/python_scripts/03_categorical_pipeline.py @@ -81,7 +81,7 @@ # %% data_categorical = data[categorical_columns] -data_categorical.head() +data_categorical # %% print(f"The dataset is composed of {data_categorical.shape[1]} features") @@ -194,7 +194,7 @@ # %% print(f"The dataset is composed of {data_categorical.shape[1]} features") -data_categorical.head() +data_categorical # %% data_encoded = encoder.fit_transform(data_categorical) diff --git a/python_scripts/03_categorical_pipeline_column_transformer.py b/python_scripts/03_categorical_pipeline_column_transformer.py index fd429749e..3ee06fad7 100644 --- a/python_scripts/03_categorical_pipeline_column_transformer.py +++ b/python_scripts/03_categorical_pipeline_column_transformer.py @@ -165,7 +165,7 @@ # method. As an example, we predict on the five first samples from the test set. # %% -data_test.head() +data_test # %% model.predict(data_test)[:5] diff --git a/python_scripts/cross_validation_train_test.py b/python_scripts/cross_validation_train_test.py index a11cbd587..f5bd73b01 100644 --- a/python_scripts/cross_validation_train_test.py +++ b/python_scripts/cross_validation_train_test.py @@ -41,7 +41,7 @@ print(housing.DESCR) # %% -data.head() +data # %% [markdown] # To simplify future visualization, let's transform the prices from the 100 @@ -49,7 +49,7 @@ # %% target *= 100 -target.head() +target # %% [markdown] # ```{note} @@ -218,7 +218,7 @@ import pandas as pd cv_results = pd.DataFrame(cv_results) -cv_results.head() +cv_results # %% [markdown] # ```{tip} diff --git a/python_scripts/linear_models_ex_02.py b/python_scripts/linear_models_ex_02.py index fdfdaf610..12a78bc5a 100644 --- a/python_scripts/linear_models_ex_02.py +++ b/python_scripts/linear_models_ex_02.py @@ -52,7 +52,7 @@ data = penguins_non_missing[columns] target = penguins_non_missing[target_name] -data.head() +data # %% [markdown] # Now it is your turn to train a linear regression model on this 
dataset. First, diff --git a/python_scripts/linear_models_sol_02.py b/python_scripts/linear_models_sol_02.py index 03aa72005..0b0717f10 100644 --- a/python_scripts/linear_models_sol_02.py +++ b/python_scripts/linear_models_sol_02.py @@ -46,7 +46,7 @@ data = penguins_non_missing[columns] target = penguins_non_missing[target_name] -data.head() +data # %% [markdown] # Now it is your turn to train a linear regression model on this dataset. First, diff --git a/python_scripts/linear_regression_without_sklearn.py b/python_scripts/linear_regression_without_sklearn.py index acc06a0ec..a83c0cf4b 100644 --- a/python_scripts/linear_regression_without_sklearn.py +++ b/python_scripts/linear_regression_without_sklearn.py @@ -22,7 +22,7 @@ import pandas as pd penguins = pd.read_csv("../datasets/penguins_regression.csv") -penguins.head() +penguins # %% [markdown] # We aim to solve the following problem: using the flipper length of a penguin, diff --git a/python_scripts/parameter_tuning_grid_search.py b/python_scripts/parameter_tuning_grid_search.py index 12bbffb57..39de4251d 100644 --- a/python_scripts/parameter_tuning_grid_search.py +++ b/python_scripts/parameter_tuning_grid_search.py @@ -36,7 +36,7 @@ # %% data = adult_census.drop(columns=[target_name, "education-num"]) -data.head() +data # %% [markdown] # Once the dataset is loaded, we split it into a training and testing sets. @@ -193,7 +193,7 @@ cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values( "mean_test_score", ascending=False ) -cv_results.head() +cv_results # %% [markdown] # Let us focus on the most interesting columns and shorten the parameter names diff --git a/python_scripts/parameter_tuning_manual.py b/python_scripts/parameter_tuning_manual.py index 15d047a80..d010a52dc 100644 --- a/python_scripts/parameter_tuning_manual.py +++ b/python_scripts/parameter_tuning_manual.py @@ -38,7 +38,7 @@ # Our data is only numerical. 
# %% -data.head() +data # %% [markdown] # Let's create a simple predictive model made of a scaler followed by a logistic diff --git a/python_scripts/parameter_tuning_randomized_search.py b/python_scripts/parameter_tuning_randomized_search.py index b146b832d..0bcd4761d 100644 --- a/python_scripts/parameter_tuning_randomized_search.py +++ b/python_scripts/parameter_tuning_randomized_search.py @@ -44,7 +44,7 @@ # %% data = adult_census.drop(columns=[target_name, "education-num"]) -data.head() +data # %% [markdown] # Once the dataset is loaded, we split it into a training and testing sets. diff --git a/python_scripts/trees_dataset.py b/python_scripts/trees_dataset.py index 888eee5a7..457e85c3c 100644 --- a/python_scripts/trees_dataset.py +++ b/python_scripts/trees_dataset.py @@ -48,7 +48,7 @@ # Let's check the dataset more into details. # %% -penguins.head() +penguins # %% [markdown] # Since that we have few samples, we can check a scatter plot to observe the