From f3bc5e9fdc761ff9fa6f2728dc73499d55cdb174 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 12 Feb 2024 12:33:47 +0200
Subject: [PATCH 01/10] Initial implementations for raster and vector
 normality test CLI functions
---
 eis_toolkit/cli.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index f31ba0c6..f979ca99 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -267,6 +267,48 @@ class NodataHandling(str, Enum):
 
 # --- EXPLORATORY ANALYSES ---
 
+# NORMALITY TEST RASTER
+@app.command()
+def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]):
+    """Compute Shapiro-Wilk test for normality on the input data."""
+    from eis_toolkit.exploratory_analyses.statistical_tests import normality_test
+
+    typer.echo("Progress: 10%")
+
+    with rasterio.open(input_raster) as raster:
+        data = raster.read()
+    typer.echo("Progress: 25%")
+
+    results_dict = normality_test(data)
+
+    typer.echo("Progress: 75%")
+
+    json_str = json.dumps(results_dict)
+    typer.echo("Progress: 100%")
+    typer.echo(f"Results: {json_str}")
+    typer.echo("Normality test (raster) completed")
+
+
+# NORMALITY TEST VECTOR
+@app.command()
+def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None):
+    """Compute Shapiro-Wilk test for normality on the input data."""
+    from eis_toolkit.exploratory_analyses.statistical_tests import normality_test
+
+    typer.echo("Progress: 10%")
+
+    geodataframe = gpd.read_file(input_vector)
+    typer.echo("Progress: 25%")
+
+    results_dict = normality_test(geodataframe, columns)
+
+    typer.echo("Progress: 75%")
+
+    json_str = json.dumps(results_dict)
+    typer.echo("Progress: 100%")
+    typer.echo(f"Results: {json_str}")
+    typer.echo("Normality test (vector) completed")
+
 
 # DBSCAN
 @app.command()

From 1c562025a41973f6c64b1ad3453102b637ef6eb8 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Fri, 23 Feb 2024 10:42:10 +0200
Subject: [PATCH 02/10] Added CLI functions for normality tests.
 Adjusted toolkit func implementations
---
 eis_toolkit/cli.py                                 | 16 +++++++++-------
 .../exploratory_analyses/normality_test.py         | 17 ++++++++++-------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index a9e62631..d660591f 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -276,9 +276,9 @@ class LocalMoranWeightType(str, Enum):
 
 # NORMALITY TEST RASTER
 @app.command()
-def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]):
-    """Compute Shapiro-Wilk test for normality on the input data."""
-    from eis_toolkit.exploratory_analyses.statistical_tests import normality_test
+def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], bands: Optional[List[int]] = None):
+    """Compute Shapiro-Wilk test for normality on the input raster data."""
+    from eis_toolkit.exploratory_analyses.normality_test import normality_test_array
 
     typer.echo("Progress: 10%")
 
@@ -286,7 +286,9 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION]):
     with rasterio.open(input_raster) as raster:
         data = raster.read()
     typer.echo("Progress: 25%")
 
-    results_dict = normality_test(data)
+    if len(bands) == 0:
+        bands = None
+    results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)
 
     typer.echo("Progress: 75%")
 
@@ -299,15 +301,15 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION],
 # NORMALITY TEST VECTOR
 @app.command()
 def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None):
-    """Compute Shapiro-Wilk test for normality on the input data."""
-    from eis_toolkit.exploratory_analyses.statistical_tests import normality_test
+    """Compute Shapiro-Wilk test for normality on the input vector data."""
+    from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe
 
     typer.echo("Progress: 10%")
 
     geodataframe = gpd.read_file(input_vector)
     typer.echo("Progress: 25%")
 
-    results_dict = normality_test(geodataframe, columns)
+    results_dict = normality_test_dataframe(geodataframe, columns)
 
     typer.echo("Progress: 75%")
 
diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py
index 4034d2b1..d3507fe7 100644
--- a/eis_toolkit/exploratory_analyses/normality_test.py
+++ b/eis_toolkit/exploratory_analyses/normality_test.py
@@ -59,7 +59,8 @@ def normality_test_dataframe(
     for column in columns:
         if len(data[column]) > 5000:
             raise SampleSizeExceededException(f"Sample size for column '{column}' exceeds the limit of 5000 samples.")
-        statistics[column] = shapiro(data[column])
+        stat, p_value = shapiro(data[column])
+        statistics[column] = {"Statistic": stat, "p-value": p_value}
 
     return statistics
 
@@ -67,7 +68,7 @@
 
 @beartype
 def normality_test_array(
     data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None
-) -> Dict[int, Tuple[float, float]]:
+) -> Dict[str, Tuple[float, float]]:
     """
     Compute Shapiro-Wilk test for normality on the input Numpy array.
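# ---------------------------------------------------------------------------
# Editor's usage sketch -- illustration only, not part of PATCH 02.
# Based on the hunk above, normality_test_dataframe now returns named
# statistics per column instead of a bare (statistic, p-value) tuple.
# The sample values are made up; the import path follows this patch.
import pandas as pd
from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe

df = pd.DataFrame({"a": [1.2, 0.8, 1.1, 0.9, 1.0]})
results = normality_test_dataframe(data=df, columns=["a"])
# Expected shape: {"a": {"Statistic": <float>, "p-value": <float>}}
print(results["a"]["p-value"])
# ---------------------------------------------------------------------------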
@@ -94,14 +95,14 @@ def normality_test_array( if data.ndim == 1 or data.ndim == 2: prepared_data = np.expand_dims(data, axis=0) - bands = range(1) + bands = [1] elif data.ndim == 3: if bands is not None: - if not all(band < len(data) for band in bands): + if not all(band - 1 < len(data) for band in bands): raise InvalidRasterBandException("All selected bands were not found in the input array.") else: - bands = range(len(data)) + bands = range(1, len(data) + 1) prepared_data = data else: @@ -110,7 +111,8 @@ def normality_test_array( statistics = {} for band in bands: - flattened_data = prepared_data[band].ravel() + band_idx = band - 1 + flattened_data = prepared_data[band_idx].ravel() nan_mask = flattened_data == np.nan if nodata_value is not None: @@ -121,6 +123,7 @@ def normality_test_array( if len(masked_data) > 5000: raise SampleSizeExceededException(f"Sample size for band '{band}' exceeds the limit of 5000 samples.") - statistics[band] = shapiro(masked_data) + stat, p_value = shapiro(masked_data) + statistics[f"Band {band}"] = {"Statistic": stat, "p-value": p_value} return statistics From 6cd07031668b852ada4540d4a97e1907b613fe96 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 11:10:47 +0200 Subject: [PATCH 03/10] Added CLI function for chi-square test, adjusted toolkit func --- eis_toolkit/cli.py | 29 +++++++++++++++-- .../exploratory_analyses/statistical_tests.py | 31 ++++++++++--------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index d660591f..23c90010 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -300,7 +300,7 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION], # NORMALITY TEST VECTOR @app.command() -def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: List[str] = None): +def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], columns: Optional[List[str]] = None): """Compute Shapiro-Wilk test for normality on the input vector data.""" from eis_toolkit.exploratory_analyses.normality_test import normality_test_dataframe @@ -309,7 +309,7 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], geodataframe = gpd.read_file(input_vector) typer.echo("Progress: 25%") - results_dict = normality_test_dataframe(geodataframe, columns) + results_dict = normality_test_dataframe(data=geodataframe, columns=columns) typer.echo("Progress: 75%") @@ -319,6 +319,31 @@ def normality_test_vector_cli(input_vector: Annotated[Path, INPUT_FILE_OPTION], typer.echo("Normality test (vector) completed") +# CHI-SQUARE_TEST +@app.command() +def chi_square_test_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + target_column: str = typer.Option(), + columns: Optional[List[str]] = None, +): + """Perform a Chi-square test of independence between a target variable and one or more other variables.""" + from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + typer.echo("Progress: 25%") + + results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) + + typer.echo("Progress: 75%") + + json_str = json.dumps(results_dict) + typer.echo("Progress: 100%") + typer.echo(f"Results: {json_str}") + typer.echo("Chi-square test completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py 
b/eis_toolkit/exploratory_analyses/statistical_tests.py index c7f80117..7a64b3ae 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -9,22 +9,24 @@ @beartype def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: - """Compute Chi-square test for independence on the input data. + """Perform a Chi-square test of independence between a target variable and one or more other variables. - It is assumed that the variables in the input data are independent and that they are categorical, i.e. strings, - booleans or integers, but not floats. + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. Args: - data: Dataframe containing the input data + data: Dataframe containing the input data. target_column: Variable against which independence of other variables is tested. columns: Variables that are tested against the variable in target_column. If None, every column is used. Returns: - Test statistics for each variable (except target_column). + Test statistics, p-value and degrees of freedom for each variable. Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided. + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. """ if check_empty_dataframe(data): raise EmptyDataFrameException("The input Dataframe is empty.") @@ -32,19 +34,18 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se if not check_columns_valid(data, [target_column]): raise InvalidParameterValueException("Target column not found in the Dataframe.") - if columns is not None: + if columns: invalid_columns = [column for column in columns if column not in data.columns] - if any(invalid_columns): - raise InvalidParameterValueException(f"The following variables are not in the dataframe: {invalid_columns}") + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") else: - columns = data.columns + columns = [col for col in data.columns if col != target_column] statistics = {} for column in columns: - if column != target_column: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = (chi_square, p_value, degrees_of_freedom) + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} return statistics From 4ede1878aa753dd84f686293d656f34a19ad523a Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Fri, 23 Feb 2024 12:12:49 +0200 Subject: [PATCH 04/10] Added CLI functions for covariance and correlation matrices, adjusted toolkit implementations --- eis_toolkit/cli.py | 70 ++++++++++++++++++- .../exploratory_analyses/statistical_tests.py | 37 ++++++++-- 2 files changed, 99 insertions(+), 8 deletions(-) diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index 23c90010..f5bdd5ee 100644 --- a/eis_toolkit/cli.py +++ 
b/eis_toolkit/cli.py @@ -207,6 +207,14 @@ class LocalMoranWeightType(str, Enum): knn = "knn" +class CorrelationMethod(str, Enum): + """Correlation methods available.""" + + pearson = "pearson" + kendall = "kendall" + spearman = "spearman" + + RESAMPLING_MAPPING = { "nearest": warp.Resampling.nearest, "bilinear": warp.Resampling.bilinear, @@ -331,7 +339,7 @@ def chi_square_test_cli( typer.echo("Progress: 10%") - geodataframe = gpd.read_file(input_vector) + geodataframe = gpd.read_file(input_vector) # Should we drop geometry columns? typer.echo("Progress: 25%") results_dict = chi_square_test(data=geodataframe, target_column=target_column, columns=columns) @@ -344,6 +352,66 @@ def chi_square_test_cli( typer.echo("Chi-square test completed") +# CORRELATION MATRIX +@app.command() +def correlation_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + correlation_method: CorrelationMethod = CorrelationMethod.pearson, + min_periods: Optional[int] = None, +): + """Compute correlation matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = correlation_matrix( + data=dataframe, columns=columns, correlation_method=correlation_method, min_periods=min_periods + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Correlation matrix completed") + + +# COVARIANCE MATRIX +@app.command() +def covariance_matrix_cli( + input_vector: Annotated[Path, INPUT_FILE_OPTION], + output_file: Annotated[Path, OUTPUT_FILE_OPTION], + columns: Optional[List[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +): + """Compute covariance matrix on the input data.""" + from eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + + typer.echo("Progress: 10%") + + geodataframe = gpd.read_file(input_vector) + dataframe = pd.DataFrame(geodataframe.drop(columns="geometry")) + typer.echo("Progress: 25%") + + output_df = covariance_matrix( + data=dataframe, columns=columns, min_periods=min_periods, delta_degrees_of_freedom=delta_degrees_of_freedom + ) + + typer.echo("Progress: 75%") + + output_df.to_csv(output_file) + typer.echo("Progress: 100%") + + typer.echo("Covariance matrix completed") + + # DBSCAN @app.command() def dbscan_cli( diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 7a64b3ae..8fe5721f 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,10 +1,11 @@ +import numpy as np import pandas as pd from beartype import beartype from beartype.typing import Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException -from eis_toolkit.utilities.checks.dataframe import check_columns_numeric, check_columns_valid, check_empty_dataframe +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe @beartype @@ -53,6 +54,7 @@ def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Se @beartype def correlation_matrix( data: pd.DataFrame, + 
columns: Optional[Sequence[str]] = None,
     correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
     min_periods: Optional[int] = None,
 ) -> pd.DataFrame:
@@ -62,6 +64,7 @@ def correlation_matrix(
 
     Args:
         data: Dataframe containing the input data.
+        columns: Columns to include in the correlation matrix. If None, all numeric columns are used.
         correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
         min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
 
@@ -72,12 +75,20 @@ def correlation_matrix(
     Raises:
         EmptyDataFrameException: The input Dataframe is empty.
         InvalidParameterValueException: min_periods argument is used with method 'kendall'.
-        NonNumericDataException: The input data contain non-numeric data.
+        NonNumericDataException: The selected columns contain non-numeric data.
     """
     if check_empty_dataframe(data):
         raise EmptyDataFrameException("The input Dataframe is empty.")
 
-    if not check_columns_numeric(data, data.columns.to_list()):
+    if columns:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if invalid_columns:
+            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
+        data_subset = data[columns]
+    else:
+        data_subset = data.select_dtypes(include=np.number)
+
+    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
         raise NonNumericDataException("The input data contain non-numeric data.")
 
     if correlation_method == "kendall" and min_periods is not None:
@@ -85,14 +96,17 @@ def correlation_matrix(
             "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
         )
 
-    matrix = data.corr(method=correlation_method, min_periods=min_periods, numeric_only=True)
+    matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True)
 
     return matrix
 
 
 @beartype
 def covariance_matrix(
-    data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1
+    data: pd.DataFrame,
+    columns: Optional[Sequence[str]] = None,
+    min_periods: Optional[int] = None,
+    delta_degrees_of_freedom: int = 1,
 ) -> pd.DataFrame:
     """Compute covariance matrix on the input data.
 
@@ -100,6 +114,7 @@ def covariance_matrix(
 
     Args:
         data: Dataframe containing the input data.
+        columns: Columns to include in the covariance matrix. If None, all numeric columns are used.
         min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
         delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1.
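# ---------------------------------------------------------------------------
# Editor's usage sketch -- illustration only, not part of PATCH 04.
# The new `columns` parameter subsets the dataframe before the matrix is
# computed; with columns=None, all numeric columns are used. Selecting a
# non-numeric column raises NonNumericDataException per the checks above.
# Sample data is made up; at this point in the series the functions still
# live in statistical_tests (they move to their own modules in PATCH 06).
import pandas as pd
from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix, covariance_matrix

df = pd.DataFrame({"a": [0.0, 2.0, 2.0, 0.0], "b": [1.0, 0.0, 1.0, 1.0], "label": ["x", "y", "x", "y"]})
corr = correlation_matrix(data=df, columns=["a", "b"], correlation_method="pearson")
cov = covariance_matrix(data=df, columns=["a", "b"], delta_degrees_of_freedom=1)
print(corr)  # 2x2 Pearson correlation matrix for "a" and "b"
print(cov)   # 2x2 covariance matrix with ddof=1
# ---------------------------------------------------------------------------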
@@ -114,7 +129,15 @@ def covariance_matrix(
     if check_empty_dataframe(data):
         raise EmptyDataFrameException("The input Dataframe is empty.")
 
-    if not check_columns_numeric(data, data.columns.to_list()):
+    if columns:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if invalid_columns:
+            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
+        data_subset = data[columns]
+    else:
+        data_subset = data.select_dtypes(include=np.number)
+
+    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
         raise NonNumericDataException("The input data contain non-numeric data.")
 
     if delta_degrees_of_freedom < 0:
@@ -123,6 +146,6 @@ def covariance_matrix(
     if min_periods and min_periods < 0:
         raise InvalidParameterValueException("Min perioids must be non-negative.")
 
-    matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
 
     return matrix

From a4d367018046249e79da70e248f65cc6ecad580e Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Fri, 23 Feb 2024 12:39:35 +0200
Subject: [PATCH 05/10] Update tests for statistical functions, typing fixes
---
 eis_toolkit/exploratory_analyses/normality_test.py |  6 +++---
 .../exploratory_analyses/statistical_tests.py      |  6 ++++--
 tests/exploratory_analyses/normality_test_test.py  | 14 +++++++-------
 .../exploratory_analyses/statistical_tests_test.py |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py
index d3507fe7..8e8fcad0 100644
--- a/eis_toolkit/exploratory_analyses/normality_test.py
+++ b/eis_toolkit/exploratory_analyses/normality_test.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 from beartype import beartype
-from beartype.typing import Dict, Optional, Sequence, Tuple
+from beartype.typing import Dict, Optional, Sequence
 from scipy.stats import shapiro
 
 from eis_toolkit.exceptions import (
@@ -20,7 +20,7 @@
 @beartype
 def normality_test_dataframe(
     data: pd.DataFrame, columns: Optional[Sequence[str]] = None
-) -> Dict[str, Tuple[float, float]]:
+) -> Dict[str, Dict[str, float]]:
     """
     Compute Shapiro-Wilk test for normality on the input DataFrame.
 
@@ -67,7 +68,7 @@ def normality_test_dataframe(
 @beartype
 def normality_test_array(
     data: np.ndarray, bands: Optional[Sequence[int]] = None, nodata_value: Optional[Number] = None
-) -> Dict[str, Tuple[float, float]]:
+) -> Dict[str, Dict[str, float]]:
     """
     Compute Shapiro-Wilk test for normality on the input Numpy array.
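# ---------------------------------------------------------------------------
# Editor's usage sketch -- illustration only, not part of PATCH 05.
# With the typing change above, band results are keyed by "Band <n>" strings
# (1-based) and hold named statistics. The random test array is made up.
import numpy as np
from eis_toolkit.exploratory_analyses.normality_test import normality_test_array

data = np.random.default_rng(0).normal(size=(2, 10, 10))  # 2 bands, 10x10 cells
results = normality_test_array(data=data, bands=[1])
print(results["Band 1"]["Statistic"], results["Band 1"]["p-value"])
# ---------------------------------------------------------------------------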
diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py index 8fe5721f..518492fe 100644 --- a/eis_toolkit/exploratory_analyses/statistical_tests.py +++ b/eis_toolkit/exploratory_analyses/statistical_tests.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd from beartype import beartype -from beartype.typing import Literal, Optional, Sequence +from beartype.typing import Dict, Literal, Optional, Sequence from scipy.stats import chi2_contingency from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException @@ -9,7 +9,9 @@ @beartype -def chi_square_test(data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None) -> dict: +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: """Perform a Chi-square test of independence between a target variable and one or more other variables. Input data should be categorical data. Continuous data or non-categorical data should be discretized or diff --git a/tests/exploratory_analyses/normality_test_test.py b/tests/exploratory_analyses/normality_test_test.py index c2012578..dcf3f5b4 100644 --- a/tests/exploratory_analyses/normality_test_test.py +++ b/tests/exploratory_analyses/normality_test_test.py @@ -24,35 +24,35 @@ def test_normality_test_dataframe(): """Test that returned normality statistics for DataFrame data are correct.""" output_statistics = normality_test_dataframe(data=DATA_DF, columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.82827, 0.13502), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), [0.82827, 0.13502], decimal=5) def test_normality_test_array(): """Test that returned normality statistics for Numpy array data are correct.""" # 3D array - output_statistics = normality_test_array(data=DATA_ARRAY, bands=[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + output_statistics = normality_test_array(data=DATA_ARRAY, bands=[1]) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 2D array output_statistics = normality_test_array(data=DATA_ARRAY[0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) # 1D array output_statistics = normality_test_array(data=DATA_ARRAY[0][0]) - np.testing.assert_array_almost_equal(output_statistics[0], (0.9067, 0.41504), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.9067, 0.41504], decimal=5) def test_normality_test_dataframe_missing_data(): """Test that DataFrame input with missing data returns statistics correctly.""" df_with_nan = DATA_DF.replace(3, np.nan) output_statistics = normality_test_dataframe(data=df_with_nan, columns=["a"]) - np.testing.assert_array_almost_equal(output_statistics["a"], (0.62978, 0.00124), decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["a"].values()), (0.62978, 0.00124), decimal=5) def test_normality_test_array_nodata(): """Test that Numpy array input with missing data returns statistics correctly.""" output_statistics = normality_test_array(data=DATA_ARRAY, nodata_value=3) - np.testing.assert_array_almost_equal(output_statistics[0], (0.91021, 0.01506), 
decimal=5) + np.testing.assert_array_almost_equal(list(output_statistics["Band 1"].values()), [0.91021, 0.01506], decimal=5) def test_invalid_selection(): diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py index 71954c0f..03c68d7e 100644 --- a/tests/exploratory_analyses/statistical_tests_test.py +++ b/tests/exploratory_analyses/statistical_tests_test.py @@ -22,7 +22,7 @@ def test_chi_square_test(): """Test that returned statistics for independence are correct.""" output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1)) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) def test_correlation_matrix_nan(): @@ -56,7 +56,7 @@ def test_correlation_matrix(): def test_correlation_matrix_non_numeric(): """Test that returned correlation matrix is correct.""" with pytest.raises(NonNumericDataException): - correlation_matrix(data=non_numeric_df) + correlation_matrix(data=non_numeric_df, columns=["a", "b"]) def test_covariance_matrix_nan(): From 62fe65430e7d74db5a9e3f410375eea55a52fef6 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:32:29 +0200 Subject: [PATCH 06/10] Separated correlation matrix, covariance matrix and chi2 tests into own modules --- eis_toolkit/cli.py | 6 +- .../exploratory_analyses/chi_square_test.py | 52 ++++++ .../correlation_matrix.py | 57 +++++++ .../exploratory_analyses/covariance_matrix.py | 57 +++++++ .../exploratory_analyses/statistical_tests.py | 153 ------------------ tests/exploratory_analyses/chi_square_test.py | 20 +++ .../correlation_matrix_test.py | 53 ++++++ .../covariance_matrix_test.py | 56 +++++++ .../statistical_tests_test.py | 117 -------------- 9 files changed, 298 insertions(+), 273 deletions(-) create mode 100644 eis_toolkit/exploratory_analyses/chi_square_test.py create mode 100644 eis_toolkit/exploratory_analyses/correlation_matrix.py create mode 100644 eis_toolkit/exploratory_analyses/covariance_matrix.py delete mode 100644 eis_toolkit/exploratory_analyses/statistical_tests.py create mode 100644 tests/exploratory_analyses/chi_square_test.py create mode 100644 tests/exploratory_analyses/correlation_matrix_test.py create mode 100644 tests/exploratory_analyses/covariance_matrix_test.py delete mode 100644 tests/exploratory_analyses/statistical_tests_test.py diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py index f5bdd5ee..ad070e98 100644 --- a/eis_toolkit/cli.py +++ b/eis_toolkit/cli.py @@ -335,7 +335,7 @@ def chi_square_test_cli( columns: Optional[List[str]] = None, ): """Perform a Chi-square test of independence between a target variable and one or more other variables.""" - from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test + from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test typer.echo("Progress: 10%") @@ -362,7 +362,7 @@ def correlation_matrix_cli( min_periods: Optional[int] = None, ): """Compute correlation matrix on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import correlation_matrix + from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix typer.echo("Progress: 10%") @@ -392,7 +392,7 @@ def covariance_matrix_cli( delta_degrees_of_freedom: int = 1, ): """Compute covariance matrix on the input data.""" - from eis_toolkit.exploratory_analyses.statistical_tests import covariance_matrix + from 
eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix typer.echo("Progress: 10%") diff --git a/eis_toolkit/exploratory_analyses/chi_square_test.py b/eis_toolkit/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..cf82aa25 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/chi_square_test.py @@ -0,0 +1,52 @@ +import pandas as pd +from beartype import beartype +from beartype.typing import Dict, Optional, Sequence +from scipy.stats import chi2_contingency + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException +from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe + + +@beartype +def chi_square_test( + data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None +) -> Dict[str, Dict[str, float]]: + """Perform a Chi-square test of independence between a target variable and one or more other variables. + + Input data should be categorical data. Continuous data or non-categorical data should be discretized or + binned before using this function, as Chi-square tests are not applicable to continuous variables directly. + + The test assumes that the observed frequencies in each category are independent. + + Args: + data: Dataframe containing the input data. + target_column: Variable against which independence of other variables is tested. + columns: Variables that are tested against the variable in target_column. If None, every column is used. + + Returns: + Test statistics, p-value and degrees of freedom for each variable. + + Raises: + EmptyDataFrameException: Input Dataframe is empty. + InvalidParameterValueException: Invalid column is input. + """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if not check_columns_valid(data, [target_column]): + raise InvalidParameterValueException("Target column not found in the Dataframe.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + else: + columns = [col for col in data.columns if col != target_column] + + statistics = {} + for column in columns: + contingency_table = pd.crosstab(data[target_column], data[column]) + chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) + statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} + + return statistics diff --git a/eis_toolkit/exploratory_analyses/correlation_matrix.py b/eis_toolkit/exploratory_analyses/correlation_matrix.py new file mode 100644 index 00000000..bdcb79f3 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/correlation_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Literal, Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def correlation_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", + min_periods: Optional[int] = None, +) -> pd.DataFrame: + """Compute correlation matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. 
+ + Args: + data: Dataframe containing the input data. + columns: Columns to include in the correlation matrix. If None, all numeric columns are used. + correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + + Returns: + Dataframe containing matrix representing the correlation coefficient \ + between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: min_periods argument is used with method 'kendall'. + NonNumericDataException: The selected columns contain non-numeric data. + """ + if check_empty_dataframe(data): + raise EmptyDataFrameException("The input Dataframe is empty.") + + if columns: + invalid_columns = [column for column in columns if column not in data.columns] + if invalid_columns: + raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") + data_subset = data[columns] + else: + data_subset = data.select_dtypes(include=np.number) + + if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): + raise NonNumericDataException("The input data contain non-numeric data.") + + if correlation_method == "kendall" and min_periods is not None: + raise InvalidParameterValueException( + "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." + ) + + matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) + + return matrix diff --git a/eis_toolkit/exploratory_analyses/covariance_matrix.py b/eis_toolkit/exploratory_analyses/covariance_matrix.py new file mode 100644 index 00000000..25c63850 --- /dev/null +++ b/eis_toolkit/exploratory_analyses/covariance_matrix.py @@ -0,0 +1,57 @@ +import numpy as np +import pandas as pd +from beartype import beartype +from beartype.typing import Optional, Sequence + +from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException +from eis_toolkit.utilities.checks.dataframe import check_empty_dataframe + + +@beartype +def covariance_matrix( + data: pd.DataFrame, + columns: Optional[Sequence[str]] = None, + min_periods: Optional[int] = None, + delta_degrees_of_freedom: int = 1, +) -> pd.DataFrame: + """Compute covariance matrix on the input data. + + It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. + + Args: + data: Dataframe containing the input data. + columns: Columns to include in the covariance matrix. If None, all numeric columns are used. + min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. + delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1. + + Returns: + Dataframe containing matrix representing the covariance between the corresponding pair of variables. + + Raises: + EmptyDataFrameException: The input Dataframe is empty. + InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. + NonNumericDataException: The input data contain non-numeric data. 
+    """
+    if check_empty_dataframe(data):
+        raise EmptyDataFrameException("The input Dataframe is empty.")
+
+    if columns:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if invalid_columns:
+            raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}")
+        data_subset = data[columns]
+    else:
+        data_subset = data.select_dtypes(include=np.number)
+
+    if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))):
+        raise NonNumericDataException("The input data contain non-numeric data.")
+
+    if delta_degrees_of_freedom < 0:
+        raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.")
+
+    if min_periods and min_periods < 0:
+        raise InvalidParameterValueException("Min periods must be non-negative.")
+
+    matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+
+    return matrix
diff --git a/eis_toolkit/exploratory_analyses/statistical_tests.py b/eis_toolkit/exploratory_analyses/statistical_tests.py
deleted file mode 100644
index 518492fe..00000000
--- a/eis_toolkit/exploratory_analyses/statistical_tests.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import numpy as np
-import pandas as pd
-from beartype import beartype
-from beartype.typing import Dict, Literal, Optional, Sequence
-from scipy.stats import chi2_contingency
-
-from eis_toolkit.exceptions import EmptyDataFrameException, InvalidParameterValueException, NonNumericDataException
-from eis_toolkit.utilities.checks.dataframe import check_columns_valid, check_empty_dataframe
-
-
-@beartype
-def chi_square_test(
-    data: pd.DataFrame, target_column: str, columns: Optional[Sequence[str]] = None
-) -> Dict[str, Dict[str, float]]:
-    """Perform a Chi-square test of independence between a target variable and one or more other variables.
-
-    Input data should be categorical data. Continuous data or non-categorical data should be discretized or
-    binned before using this function, as Chi-square tests are not applicable to continuous variables directly.
-
-    The test assumes that the observed frequencies in each category are independent.
-
-    Args:
-        data: Dataframe containing the input data.
-        target_column: Variable against which independence of other variables is tested.
-        columns: Variables that are tested against the variable in target_column. If None, every column is used.
-
-    Returns:
-        Test statistics, p-value and degrees of freedom for each variable.
-
-    Raises:
-        EmptyDataFrameException: Input Dataframe is empty.
-        InvalidParameterValueException: Invalid column is input.
- """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if not check_columns_valid(data, [target_column]): - raise InvalidParameterValueException("Target column not found in the Dataframe.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - else: - columns = [col for col in data.columns if col != target_column] - - statistics = {} - for column in columns: - contingency_table = pd.crosstab(data[target_column], data[column]) - chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table) - statistics[column] = {"chi_square": chi_square, "p-value": p_value, "degrees_of_freedom": degrees_of_freedom} - - return statistics - - -@beartype -def correlation_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson", - min_periods: Optional[int] = None, -) -> pd.DataFrame: - """Compute correlation matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. - - Args: - data: Dataframe containing the input data. - columns: Columns to include in the correlation matrix. If None, all numeric columns are used. - correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - - Returns: - Dataframe containing matrix representing the correlation coefficient \ - between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: min_periods argument is used with method 'kendall'. - NonNumericDataException: The selected columns contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if correlation_method == "kendall" and min_periods is not None: - raise InvalidParameterValueException( - "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'." - ) - - matrix = data_subset.corr(method=correlation_method, min_periods=min_periods, numeric_only=True) - - return matrix - - -@beartype -def covariance_matrix( - data: pd.DataFrame, - columns: Optional[Sequence[str]] = None, - min_periods: Optional[int] = None, - delta_degrees_of_freedom: int = 1, -) -> pd.DataFrame: - """Compute covariance matrix on the input data. - - It is assumed that the data is numeric, i.e. integers or floats. NaN values are excluded from the calculations. - - Args: - data: Dataframe containing the input data. - columns: Columns to include in the covariance matrix. If None, all numeric columns are used. - min_periods: Minimum number of observations required per pair of columns to have valid result. Optional. - delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. 
Defaults to 1. - - Returns: - Dataframe containing matrix representing the covariance between the corresponding pair of variables. - - Raises: - EmptyDataFrameException: The input Dataframe is empty. - InvalidParameterValueException: Provided value for delta_degrees_of_freedom or min_periods is negative. - NonNumericDataException: The input data contain non-numeric data. - """ - if check_empty_dataframe(data): - raise EmptyDataFrameException("The input Dataframe is empty.") - - if columns: - invalid_columns = [column for column in columns if column not in data.columns] - if invalid_columns: - raise InvalidParameterValueException(f"Invalid columns: {invalid_columns}") - data_subset = data[columns] - else: - data_subset = data.select_dtypes(include=np.number) - - if not all(data_subset.dtypes.apply(lambda x: np.issubdtype(x, np.number))): - raise NonNumericDataException("The input data contain non-numeric data.") - - if delta_degrees_of_freedom < 0: - raise InvalidParameterValueException("Delta degrees of freedom must be non-negative.") - - if min_periods and min_periods < 0: - raise InvalidParameterValueException("Min perioids must be non-negative.") - - matrix = data_subset.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom) - - return matrix diff --git a/tests/exploratory_analyses/chi_square_test.py b/tests/exploratory_analyses/chi_square_test.py new file mode 100644 index 00000000..83385d47 --- /dev/null +++ b/tests/exploratory_analyses/chi_square_test.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd +import pytest + +from eis_toolkit.exceptions import InvalidParameterValueException +from eis_toolkit.exploratory_analyses.chi_square_test import chi_square_test + +DATA = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) + + +def test_chi_square_test(): + """Test that returned statistics for independence are correct.""" + output_statistics = chi_square_test(data=DATA, target_column="e", columns=["f"]) + np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) + + +def test_invalid_target_column(): + """Test that invalid target column raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + chi_square_test(data=DATA, target_column="invalid_column") diff --git a/tests/exploratory_analyses/correlation_matrix_test.py b/tests/exploratory_analyses/correlation_matrix_test.py new file mode 100644 index 00000000..903fa48c --- /dev/null +++ b/tests/exploratory_analyses/correlation_matrix_test.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest +from beartype.roar import BeartypeCallHintParamViolation + +from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException +from eis_toolkit.exploratory_analyses.correlation_matrix import correlation_matrix +from tests.exploratory_analyses.covariance_matrix_test import DF, DF_NON_NUMERIC, DF_WITH_NAN + + +def test_correlation_matrix_nan(): + """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" + expected_correlation_matrix = np.array( + [ + [1.000000, -0.577350, -1.000000, 1.000000], + [-0.577350, 1.000000, np.nan, -0.577350], + [-1.000000, np.nan, 1.000000, -1.000000], + [1.000000, -0.577350, -1.000000, 1.000000], + ] + ) + output_matrix = correlation_matrix(data=DF_WITH_NAN) + np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) + + +def test_correlation_matrix(): + """Test that returned correlation matrix is correct.""" + expected_correlation_matrix = np.array( + [ + 
[1.000000, -0.577350, -0.904534, 1.000000], + [-0.577350, 1.000000, 0.174078, -0.577350], + [-0.904534, 0.174078, 1.000000, -0.904534], + [1.000000, -0.577350, -0.904534, 1.000000], + ] + ) + output_matrix = correlation_matrix(data=DF) + np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) + + +def test_correlation_matrix_non_numeric(): + """Test that returned correlation matrix is correct.""" + with pytest.raises(NonNumericDataException): + correlation_matrix(data=DF_NON_NUMERIC, columns=["a", "b"]) + + +def test_invalid_correlation_method(): + """Test that invalid correlation method raises the correct exception.""" + with pytest.raises(BeartypeCallHintParamViolation): + correlation_matrix(data=DF, correlation_method="invalid_method") + + +def test_min_periods_with_kendall(): + """Test that min_periods with correlation_method 'kendall' raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + correlation_matrix(data=DF, correlation_method="kendall", min_periods=1) diff --git a/tests/exploratory_analyses/covariance_matrix_test.py b/tests/exploratory_analyses/covariance_matrix_test.py new file mode 100644 index 00000000..be69dd64 --- /dev/null +++ b/tests/exploratory_analyses/covariance_matrix_test.py @@ -0,0 +1,56 @@ +import numpy as np +import pandas as pd +import pytest + +from eis_toolkit.exceptions import InvalidParameterValueException +from eis_toolkit.exploratory_analyses.covariance_matrix import covariance_matrix + +DATA = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]]) +DF = pd.DataFrame(DATA, columns=["a", "b", "c", "d"]) +DF_NON_NUMERIC = pd.DataFrame( + data=np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]), + columns=["a", "b", "c", "d"], +) +DF_WITH_NAN = pd.DataFrame( + data=np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]), columns=["a", "b", "c", "d"] +) + + +def test_covariance_matrix_nan(): + """Test that returned covariance matrix is correct, when NaN present in the dataframe.""" + expected_correlation_matrix = np.array( + [ + [1.333333, -0.333333, -1.333333, 0.666667], + [-0.333333, 0.25, 0, -0.166667], + [-1.333333, 0, 1.333333, -0.666667], + [0.666667, -0.166667, -0.666667, 0.333333], + ] + ) + output_matrix = covariance_matrix(data=DF_WITH_NAN) + np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) + + +def test_covariance_matrix(): + """Test that returned covariance matrix is correct.""" + expected_covariance_matrix = np.array( + [ + [1.333333, -0.333333, -1.000000, 0.666667], + [-0.333333, 0.250000, 0.083333, -0.166667], + [-1.000000, 0.083333, 0.916667, -0.500000], + [0.666667, -0.166667, -0.500000, 0.333333], + ] + ) + output_matrix = covariance_matrix(data=DF) + np.testing.assert_array_almost_equal(output_matrix, expected_covariance_matrix) + + +def test_covariance_matrix_negative_min_periods(): + """Test that negative min_periods value raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + covariance_matrix(data=DF, min_periods=-1) + + +def test_invalid_ddof(): + """Test that invalid delta degrees of freedom raises the correct exception.""" + with pytest.raises(InvalidParameterValueException): + covariance_matrix(data=DF, delta_degrees_of_freedom=-1) diff --git a/tests/exploratory_analyses/statistical_tests_test.py b/tests/exploratory_analyses/statistical_tests_test.py deleted file mode 100644 index 03c68d7e..00000000 --- 
a/tests/exploratory_analyses/statistical_tests_test.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np -import pandas as pd -import pytest -from beartype.roar import BeartypeCallHintParamViolation - -from eis_toolkit.exceptions import InvalidParameterValueException, NonNumericDataException -from eis_toolkit.exploratory_analyses.statistical_tests import chi_square_test, correlation_matrix, covariance_matrix - -data = np.array([[0, 1, 2, 1], [2, 0, 1, 2], [2, 1, 0, 2], [0, 1, 2, 1]]) -missing_data = np.array([[0, 1, 2, 1], [2, 0, np.nan, 2], [2, 1, 0, 2], [0, 1, 2, 1]]) -non_numeric_data = np.array([[0, 1, 2, 1], ["a", "b", "c", "d"], [3, 2, 1, 0], ["c", "d", "b", "a"]]) -numeric_data = pd.DataFrame(data, columns=["a", "b", "c", "d"]) -non_numeric_df = pd.DataFrame(non_numeric_data, columns=["a", "b", "c", "d"]) -missing_values_df = pd.DataFrame(missing_data, columns=["a", "b", "c", "d"]) -categorical_data = pd.DataFrame({"e": [0, 0, 1, 1], "f": [True, False, True, True]}) -target_column = "e" -np.random.seed(42) -large_data = np.random.normal(size=5001) -large_df = pd.DataFrame(large_data, columns=["a"]) - - -def test_chi_square_test(): - """Test that returned statistics for independence are correct.""" - output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=["f"]) - np.testing.assert_array_equal(list(output_statistics["f"].values()), [0.0, 1.0, 1]) - - -def test_correlation_matrix_nan(): - """Test that returned correlation matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -1.000000, 1.000000], - [-0.577350, 1.000000, np.nan, -0.577350], - [-1.000000, np.nan, 1.000000, -1.000000], - [1.000000, -0.577350, -1.000000, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix(): - """Test that returned correlation matrix is correct.""" - expected_correlation_matrix = np.array( - [ - [1.000000, -0.577350, -0.904534, 1.000000], - [-0.577350, 1.000000, 0.174078, -0.577350], - [-0.904534, 0.174078, 1.000000, -0.904534], - [1.000000, -0.577350, -0.904534, 1.000000], - ] - ) - output_matrix = correlation_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_correlation_matrix_non_numeric(): - """Test that returned correlation matrix is correct.""" - with pytest.raises(NonNumericDataException): - correlation_matrix(data=non_numeric_df, columns=["a", "b"]) - - -def test_covariance_matrix_nan(): - """Test that returned covariance matrix is correct, when NaN present in the dataframe.""" - expected_correlation_matrix = np.array( - [ - [1.333333, -0.333333, -1.333333, 0.666667], - [-0.333333, 0.25, 0, -0.166667], - [-1.333333, 0, 1.333333, -0.666667], - [0.666667, -0.166667, -0.666667, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=missing_values_df) - np.testing.assert_array_almost_equal(output_matrix, expected_correlation_matrix) - - -def test_covariance_matrix(): - """Test that returned covariance matrix is correct.""" - expected_covariance_matrix = np.array( - [ - [1.333333, -0.333333, -1.000000, 0.666667], - [-0.333333, 0.250000, 0.083333, -0.166667], - [-1.000000, 0.083333, 0.916667, -0.500000], - [0.666667, -0.166667, -0.500000, 0.333333], - ] - ) - output_matrix = covariance_matrix(data=numeric_data) - np.testing.assert_array_almost_equal(output_matrix, 
expected_covariance_matrix) - - -def test_covariance_matrix_negative_min_periods(): - """Test that negative min_periods value raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, min_periods=-1) - - -def test_invalid_target_column(): - """Test that invalid target column raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - chi_square_test(data=categorical_data, target_column="invalid_column") - - -def test_invalid_correlation_method(): - """Test that invalid correlation method raises the correct exception.""" - with pytest.raises(BeartypeCallHintParamViolation): - correlation_matrix(data=numeric_data, correlation_method="invalid_method") - - -def test_min_periods_with_kendall(): - """Test that min_periods with correlation_method 'kendall' raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - correlation_matrix(data=numeric_data, correlation_method="kendall", min_periods=1) - - -def test_invalid_ddof(): - """Test that invalid delta degrees of freedom raises the correct exception.""" - with pytest.raises(InvalidParameterValueException): - covariance_matrix(data=numeric_data, delta_degrees_of_freedom=-1) From 6093734fd3b2e4c181cbd59ed0a998c45e0fe8d7 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:37:28 +0200 Subject: [PATCH 07/10] Add docs --- docs/exploratory_analyses/chi_square_test.md | 3 +++ docs/exploratory_analyses/correlation_matrix.md | 0 docs/exploratory_analyses/covariance_matrix.md | 3 +++ docs/exploratory_analyses/statistical_testing.md | 3 --- 4 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 docs/exploratory_analyses/chi_square_test.md create mode 100644 docs/exploratory_analyses/correlation_matrix.md create mode 100644 docs/exploratory_analyses/covariance_matrix.md delete mode 100644 docs/exploratory_analyses/statistical_testing.md diff --git a/docs/exploratory_analyses/chi_square_test.md b/docs/exploratory_analyses/chi_square_test.md new file mode 100644 index 00000000..52e00339 --- /dev/null +++ b/docs/exploratory_analyses/chi_square_test.md @@ -0,0 +1,3 @@ +# Chi-square test + +::: eis_toolkit.exploratory_analyses.chi_square_test diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/exploratory_analyses/covariance_matrix.md b/docs/exploratory_analyses/covariance_matrix.md new file mode 100644 index 00000000..12385763 --- /dev/null +++ b/docs/exploratory_analyses/covariance_matrix.md @@ -0,0 +1,3 @@ +# Covariance matrix + +::: eis_toolkit.exploratory_analyses.covariance_matrix diff --git a/docs/exploratory_analyses/statistical_testing.md b/docs/exploratory_analyses/statistical_testing.md deleted file mode 100644 index 04df277a..00000000 --- a/docs/exploratory_analyses/statistical_testing.md +++ /dev/null @@ -1,3 +0,0 @@ -# Statistical (hypothesis) testing - -::: eis_toolkit.exploratory_analyses.statistical_tests From 68e00628af14ed426e61ad378a02f7521d0a1403 Mon Sep 17 00:00:00 2001 From: Niko Aarnio Date: Thu, 29 Feb 2024 09:41:21 +0200 Subject: [PATCH 08/10] minor update --- docs/exploratory_analyses/correlation_matrix.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/exploratory_analyses/correlation_matrix.md b/docs/exploratory_analyses/correlation_matrix.md index e69de29b..b86109e1 100644 --- a/docs/exploratory_analyses/correlation_matrix.md +++ 
b/docs/exploratory_analyses/correlation_matrix.md
@@ -0,0 +1,3 @@
+# Correlation matrix
+
+::: eis_toolkit.exploratory_analyses.correlation_matrix

From 19c7229bfecbf99078ffc5079c431f079c37a132 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 4 Mar 2024 13:45:50 +0200
Subject: [PATCH 09/10] fix: normality test filters to numeric columns instead
 of erroring; fixed case where CLI function computed nothing if columns param
 was left empty
---
 eis_toolkit/cli.py                                 |  1 -
 eis_toolkit/exploratory_analyses/normality_test.py | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ad070e98..7c42f3b3 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -293,7 +293,6 @@ def normality_test_raster_cli(input_raster: Annotated[Path, INPUT_FILE_OPTION],
     with rasterio.open(input_raster) as raster:
         data = raster.read()
     typer.echo("Progress: 25%")
-
     if len(bands) == 0:
         bands = None
     results_dict = normality_test_array(data=data, bands=bands, nodata_value=raster.nodata)
diff --git a/eis_toolkit/exploratory_analyses/normality_test.py b/eis_toolkit/exploratory_analyses/normality_test.py
index 8e8fcad0..26d0fb79 100644
--- a/eis_toolkit/exploratory_analyses/normality_test.py
+++ b/eis_toolkit/exploratory_analyses/normality_test.py
@@ -36,13 +36,13 @@ def normality_test_dataframe(
     Raises:
         EmptyDataException: The input data is empty.
         InvalidColumnException: All selected columns were not found in the input data.
-        NonNumericDataException: Selected data or columns contains non-numeric data.
+        NonNumericDataException: Selected columns contain non-numeric data or no numeric columns were found.
         SampleSizeExceededException: Input data exceeds the maximum of 5000 samples.
     """
     if check_empty_dataframe(data):
         raise EmptyDataException("The input Dataframe is empty.")
 
-    if columns is not None:
+    if columns is not None and columns != []:
         if not check_columns_valid(data, columns):
             raise InvalidColumnException("All selected columns were not found in the input DataFrame.")
         if not check_columns_numeric(data, columns):
@@ -51,9 +51,9 @@ def normality_test_dataframe(
         data = data[columns].dropna()
 
     else:
-        if not check_columns_numeric(data, data.columns):
-            raise NonNumericDataException("The input data contain non-numeric data.")
-        columns = data.columns
+        columns = data.select_dtypes(include=[np.number]).columns
+        if len(columns) == 0:
+            raise NonNumericDataException("No numeric columns were found.")
 
     statistics = {}
     for column in columns:

From 6782cdf52c3ea47bde6f7cca9cfde460e512fc02 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 4 Mar 2024 14:13:45 +0200
Subject: [PATCH 10/10] modify(cli): improved file type aliases, changed all
 enum/literal params to be case insensitive
---
 eis_toolkit/cli.py | 355 ++++++++++++++++++++++-----------------------
 1 file changed, 177 insertions(+), 178 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ae78b62d..873c1dba 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -232,14 +232,17 @@ class
From 6782cdf52c3ea47bde6f7cca9cfde460e512fc02 Mon Sep 17 00:00:00 2001
From: Niko Aarnio
Date: Mon, 4 Mar 2024 14:13:45 +0200
Subject: [PATCH 10/10] modify(cli): improve file type aliases, make all
 enum/literal params case-insensitive

---
 eis_toolkit/cli.py | 355 ++++++++++++++++++++++-----------------------
 1 file changed, 177 insertions(+), 178 deletions(-)

diff --git a/eis_toolkit/cli.py b/eis_toolkit/cli.py
index ae78b62d..873c1dba 100644
--- a/eis_toolkit/cli.py
+++ b/eis_toolkit/cli.py
@@ -232,14 +232,17 @@ class LocalMoranWeightType(str, Enum):
 
 # TODO: Check this and output file option
-INPUT_FILE_OPTION = typer.Option(
-    exists=True,
-    file_okay=True,
-    dir_okay=False,
-    writable=False,
-    readable=True,
-    resolve_path=True,
-)
+INPUT_FILE_OPTION = Annotated[
+    Path,
+    typer.Option(
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        readable=True,
+        resolve_path=True,
+    ),
+]
 
 INPUT_FILES_ARGUMENT = Annotated[
     List[Path],
@@ -255,21 +258,27 @@ class LocalMoranWeightType(str, Enum):
     ),
 ]
 
-OUTPUT_FILE_OPTION = typer.Option(
-    file_okay=True,
-    dir_okay=False,
-    writable=True,
-    readable=True,
-    resolve_path=True,
-)
+OUTPUT_FILE_OPTION = Annotated[
+    Path,
+    typer.Option(
+        file_okay=True,
+        dir_okay=False,
+        writable=True,
+        readable=True,
+        resolve_path=True,
+    ),
+]
 
-OUTPUT_DIR_OPTION = typer.Option(
-    file_okay=False,
-    dir_okay=True,
-    writable=True,
-    readable=True,
-    resolve_path=True,
-)
+OUTPUT_DIR_OPTION = Annotated[
+    Path,
+    typer.Option(
+        file_okay=False,
+        dir_okay=True,
+        writable=True,
+        readable=True,
+        resolve_path=True,
+    ),
+]
 
 
 # --- EXPLORATORY ANALYSES ---
@@ -278,8 +287,8 @@ class LocalMoranWeightType(str, Enum):
 # DBSCAN
 @app.command()
 def dbscan_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION],
-    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_vector: INPUT_FILE_OPTION,
+    output_vector: OUTPUT_FILE_OPTION,
     max_distance: float = 0.5,
     min_samples: int = 5,
 ):
@@ -303,10 +312,10 @@ def dbscan_cli(
 # K-MEANS CLUSTERING
 @app.command()
 def k_means_clustering_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION],
-    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_vector: INPUT_FILE_OPTION,
+    output_vector: OUTPUT_FILE_OPTION,
     number_of_clusters: Optional[int] = None,
-    random_state: int = None,  # NOTE: Check typing
+    random_state: int = None,
 ):
     """Perform k-means clustering on the input data."""
     from eis_toolkit.exploratory_analyses.k_means_cluster import k_means_clustering
@@ -330,8 +339,8 @@ def k_means_clustering_cli(
 # PARALLEL COORDINATES
 @app.command()
 def parallel_coordinates_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION],
-    output_file: Optional[Annotated[Path, OUTPUT_FILE_OPTION]] = None,
+    input_vector: INPUT_FILE_OPTION,
+    output_file: Optional[OUTPUT_FILE_OPTION] = None,
     color_column_name: str = typer.Option(),
     plot_title: Optional[str] = None,
     palette_name: Optional[str] = None,
@@ -374,10 +383,10 @@ def parallel_coordinates_cli(
 @app.command()
 def compute_pca_raster_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
     number_of_components: int = typer.Option(),
     # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func
-    nodata_handling: NodataHandling = NodataHandling.remove,
+    nodata_handling: NodataHandling = typer.Option(NodataHandling.remove, case_sensitive=False),
     # NOTE: Omitted nodata parameter. Should use raster nodata.
): """Compute defined number of principal components for raster data.""" @@ -419,12 +428,12 @@ def compute_pca_raster_cli( # PCA FOR VECTOR DATA @app.command() def compute_pca_vector_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, number_of_components: int = typer.Option(), columns: Annotated[List[str], typer.Option()] = None, # NOTE: Omitted scaler type selection here since the parameter might be deleted from PCA func - nodata_handling: NodataHandling = NodataHandling.remove, + nodata_handling: NodataHandling = typer.Option(NodataHandling.remove, case_sensitive=False), nodata: float = None, ): """Compute defined number of principal components for vector data.""" @@ -458,7 +467,7 @@ def compute_pca_vector_cli( # DESCRIPTIVE STATISTICS (RASTER) @app.command() -def descriptive_statistics_raster_cli(input_file: Annotated[Path, INPUT_FILE_OPTION]): +def descriptive_statistics_raster_cli(input_file: INPUT_FILE_OPTION): """Generate descriptive statistics from raster data.""" from eis_toolkit.exploratory_analyses.descriptive_statistics import descriptive_statistics_raster @@ -477,7 +486,7 @@ def descriptive_statistics_raster_cli(input_file: Annotated[Path, INPUT_FILE_OPT # DESCRIPTIVE STATISTICS (VECTOR) @app.command() -def descriptive_statistics_vector_cli(input_file: Annotated[Path, INPUT_FILE_OPTION], column: str = None): +def descriptive_statistics_vector_cli(input_file: INPUT_FILE_OPTION, column: str = None): """Generate descriptive statistics from vector or tabular data.""" from eis_toolkit.exploratory_analyses.descriptive_statistics import descriptive_statistics_dataframe @@ -507,10 +516,10 @@ def descriptive_statistics_vector_cli(input_file: Annotated[Path, INPUT_FILE_OPT # LOCAL MORAN'S I @app.command() def local_morans_i_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, column: str = typer.Option(), - weight_type: LocalMoranWeightType = LocalMoranWeightType.queen, + weight_type: LocalMoranWeightType = typer.Option(LocalMoranWeightType.queen, case_sensitive=False), k: int = 4, permutations: int = 999, ): @@ -561,9 +570,9 @@ def check_raster_grids_cli(input_rasters: INPUT_FILES_ARGUMENT, same_extent: boo # CLIP RASTER @app.command() def clip_raster_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - geometries: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + geometries: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, ): """Clip the input raster with geometries in a geodataframe.""" from eis_toolkit.raster_processing.clipping import clip_raster @@ -590,9 +599,9 @@ def clip_raster_cli( # CREATE CONSTANT RASTER @app.command() def create_constant_raster_cli( - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + output_raster: OUTPUT_FILE_OPTION, constant_value: float = typer.Option(), - template_raster: Annotated[Path, INPUT_FILE_OPTION] = None, + template_raster: INPUT_FILE_OPTION = None, coord_west: float = None, coord_north: float = None, coord_east: float = None, @@ -659,9 +668,9 @@ def create_constant_raster_cli( # EXTRACT VALUES FROM RASTER @app.command() def extract_values_from_raster_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - geometries: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, 
OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    geometries: INPUT_FILE_OPTION,
+    output_vector: OUTPUT_FILE_OPTION,
 ):
     """Extract raster values using point data to a DataFrame."""
     from eis_toolkit.raster_processing.extract_values_from_raster import extract_values_from_raster
@@ -684,10 +693,10 @@ def extract_values_from_raster_cli(
 # REPROJECT RASTER
 @app.command()
 def reproject_raster_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     target_crs: int = typer.Option(help="crs help"),
-    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.nearest),
+    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.nearest, case_sensitive=False),
 ):
     """Reproject the input raster to given CRS."""
     from eis_toolkit.raster_processing.reprojecting import reproject_raster
@@ -710,10 +719,10 @@ def reproject_raster_cli(
 # RESAMPLE RASTER
 @app.command()
 def resample_raster_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     resolution: float = typer.Option(),
-    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.bilinear),
+    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.bilinear, case_sensitive=False),
 ):
     """Resample the raster according to the given resolution."""
     from eis_toolkit.raster_processing.resampling import resample
@@ -736,9 +745,9 @@ def resample_raster_cli(
 # SNAP RASTER
 @app.command()
 def snap_raster_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    snap_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    snap_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Snap/align the input raster to the given snap raster."""
     from eis_toolkit.raster_processing.snapping import snap_with_raster
@@ -761,9 +770,9 @@ def snap_raster_cli(
 @app.command()
 def unify_rasters_cli(
     rasters_to_unify: INPUT_FILES_ARGUMENT,
-    base_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_directory: Annotated[Path, OUTPUT_DIR_OPTION],
-    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.nearest),
+    base_raster: INPUT_FILE_OPTION,
+    output_directory: OUTPUT_DIR_OPTION,
+    resampling_method: ResamplingMethods = typer.Option(default=ResamplingMethods.nearest, case_sensitive=False),
     same_extent: bool = False,
 ):
     """Unify rasters to match the base raster."""
@@ -803,7 +812,7 @@ def unify_rasters_cli(
 @app.command()
 def unique_combinations_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Get combinations of raster values between rasters."""
     from eis_toolkit.raster_processing.unique_combinations import unique_combinations
@@ -826,8 +835,8 @@ def unique_combinations_cli(
 # EXTRACT WINDOW
 @app.command()
 def extract_window_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     center_coords: Tuple[float, float] = typer.Option(),
     height: int = typer.Option(),
     width: int = typer.Option(),
@@ -852,9 +861,9 @@ def extract_window_cli(
 # SURFACE DERIVATIVES - CLASSIFY ASPECT
 @app.command()
 def classify_aspect_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-
output_raster: Annotated[Path, OUTPUT_FILE_OPTION], - unit: AngleUnits = AngleUnits.radians, + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, + unit: AngleUnits = typer.Option(AngleUnits.radians, case_sensitive=False), num_classes: int = 8, ): """Classify an aspect raster data set.""" @@ -879,15 +888,15 @@ def classify_aspect_cli( # SURFACE DERIVATIVES @app.command() def surface_derivatives_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], - parameters: Annotated[List[SurfaceParameter], typer.Option()], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, + parameters: Annotated[List[SurfaceParameter], typer.Option(case_sensitive=False)], scaling_factor: Optional[float] = 1.0, slope_tolerance: Optional[float] = 0.0, - slope_gradient_unit: SlopeGradientUnit = SlopeGradientUnit.radians, - slope_direction_unit: AngleUnits = AngleUnits.radians, - first_order_method: FirstOrderMethod = FirstOrderMethod.Horn, - second_order_method: SecondOrderMethod = SecondOrderMethod.Young, + slope_gradient_unit: SlopeGradientUnit = typer.Option(SlopeGradientUnit.radians, case_sensitive=False), + slope_direction_unit: AngleUnits = typer.Option(AngleUnits.radians, case_sensitive=False), + first_order_method: FirstOrderMethod = typer.Option(FirstOrderMethod.Horn, case_sensitive=False), + second_order_method: SecondOrderMethod = typer.Option(SecondOrderMethod.Young, case_sensitive=False), ): """Calculate the first and/or second order surface attributes.""" from eis_toolkit.raster_processing.derivatives.parameters import first_order, second_order_basic_set @@ -945,8 +954,8 @@ def surface_derivatives_cli( @app.command() def reclassify_with_manual_breaks_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, breaks: Annotated[List[int], typer.Option()], bands: Annotated[List[int], typer.Option()] = None, ): @@ -969,8 +978,8 @@ def reclassify_with_manual_breaks_cli( @app.command() def reclassify_with_defined_intervals_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, interval_size: int = typer.Option(), bands: Annotated[List[int], typer.Option()] = None, ): @@ -993,8 +1002,8 @@ def reclassify_with_defined_intervals_cli( @app.command() def reclassify_with_equal_intervals_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, number_of_intervals: int = typer.Option(), bands: Annotated[List[int], typer.Option()] = None, ): @@ -1019,8 +1028,8 @@ def reclassify_with_equal_intervals_cli( @app.command() def reclassify_with_quantiles_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, number_of_quantiles: int = typer.Option(), bands: Annotated[List[int], typer.Option()] = None, ): @@ -1045,8 +1054,8 @@ def reclassify_with_quantiles_cli( @app.command() def reclassify_with_natural_breaks_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, number_of_classes: int = typer.Option(), bands: 
Annotated[List[int], typer.Option()] = None, ): @@ -1071,8 +1080,8 @@ def reclassify_with_natural_breaks_cli( @app.command() def reclassify_with_geometrical_intervals_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, number_of_classes: int = typer.Option(), bands: Annotated[List[int], typer.Option()] = None, ): @@ -1097,8 +1106,8 @@ def reclassify_with_geometrical_intervals_cli( @app.command() def reclassify_with_standard_deviation_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, number_of_intervals: int = typer.Option(), bands: Annotated[List[int], typer.Option()] = None, ): @@ -1126,9 +1135,7 @@ def reclassify_with_standard_deviation_cli( # CALCULATE GEOMETRY @app.command() -def calculate_geometry_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], output_vector: Annotated[Path, OUTPUT_FILE_OPTION] -): +def calculate_geometry_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION): """Calculate the length or area of the given geometries.""" from eis_toolkit.vector_processing.calculate_geometry import calculate_geometry @@ -1147,9 +1154,7 @@ def calculate_geometry_cli( # EXTRACT SHARED LINES @app.command() -def extract_shared_lines_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], output_vector: Annotated[Path, OUTPUT_FILE_OPTION] -): +def extract_shared_lines_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION): """Extract shared lines/borders/edges between polygons.""" from eis_toolkit.vector_processing.extract_shared_lines import extract_shared_lines @@ -1169,8 +1174,8 @@ def extract_shared_lines_cli( # IDW INTERPOLATION @app.command() def idw_interpolation_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, target_column: str = typer.Option(), resolution: float = typer.Option(), power: float = 2.0, @@ -1214,14 +1219,14 @@ def idw_interpolation_cli( # KRIGING INTERPOLATION @app.command() def kriging_interpolation_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, target_column: str = typer.Option(), resolution: float = typer.Option(), extent: Tuple[float, float, float, float] = (None, None, None, None), # TODO Change this - variogram_model: VariogramModel = VariogramModel.linear, - coordinates_type: CoordinatesType = CoordinatesType.geographic, - method: KrigingMethod = KrigingMethod.ordinary, + variogram_model: VariogramModel = typer.Option(VariogramModel.linear, case_sensitive=False), + coordinates_type: CoordinatesType = typer.Option(CoordinatesType.geographic, case_sensitive=False), + method: KrigingMethod = typer.Option(KrigingMethod.ordinary, case_sensitive=False), ): """Apply kriging interpolation to input vector file.""" from eis_toolkit.vector_processing.kriging_interpolation import kriging @@ -1263,15 +1268,15 @@ def kriging_interpolation_cli( # RASTERIZE @app.command() def rasterize_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, resolution: float = None, value_column: str = 
None, default_value: float = 1.0, fill_value: float = 0.0, - base_raster_profile_raster: Annotated[Path, INPUT_FILE_OPTION] = None, + base_raster_profile_raster: INPUT_FILE_OPTION = None, buffer_value: float = None, - merge_strategy: MergeStrategy = MergeStrategy.replace, + merge_strategy: MergeStrategy = typer.Option(MergeStrategy.replace, case_sensitive=False), ): """ Rasterize input vector. @@ -1321,8 +1326,8 @@ def rasterize_cli( # REPROJECT VECTOR @app.command() def reproject_vector_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, target_crs: int = typer.Option(help="crs help"), ): """Reproject the input vector to given CRS.""" @@ -1345,12 +1350,12 @@ def reproject_vector_cli( # VECTOR DENSITY @app.command() def vector_density_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, resolution: float = None, - base_raster_profile_raster: Annotated[Path, INPUT_FILE_OPTION] = None, + base_raster_profile_raster: INPUT_FILE_OPTION = None, buffer_value: float = None, - statistic: VectorDensityStatistic = VectorDensityStatistic.density, + statistic: VectorDensityStatistic = typer.Option(VectorDensityStatistic.density, case_sensitive=False), ): """ Compute density of geometries within raster. @@ -1397,9 +1402,9 @@ def vector_density_cli( # DISTANCE COMPUTATION @app.command() def distance_computation_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - geometries: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + geometries: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, ): """Calculate distance from raster cell to nearest geometry.""" from eis_toolkit.vector_processing.distance_computation import distance_computation @@ -1433,15 +1438,15 @@ def distance_computation_cli( @app.command() def logistic_regression_train_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], - validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once), - validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy), + target_labels: INPUT_FILE_OPTION, + output_file: OUTPUT_FILE_OPTION, + validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once, case_sensitive=False), + validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy, case_sensitive=False), split_size: float = 0.2, cv_folds: int = 5, - penalty: LogisticRegressionPenalties = typer.Option(default=LogisticRegressionPenalties.l2), + penalty: LogisticRegressionPenalties = typer.Option(default=LogisticRegressionPenalties.l2, case_sensitive=False), max_iter: int = 100, - solver: LogisticRegressionSolvers = typer.Option(default=LogisticRegressionSolvers.lbfgs), + solver: LogisticRegressionSolvers = typer.Option(default=LogisticRegressionSolvers.lbfgs, case_sensitive=False), verbose: int = 0, random_state: Optional[int] = None, ): @@ -1485,10 +1490,10 @@ def logistic_regression_train_cli( @app.command() def random_forest_classifier_train_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], - validation_method: 
ValidationMethods = typer.Option(default=ValidationMethods.split_once), - validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy), + target_labels: INPUT_FILE_OPTION, + output_file: OUTPUT_FILE_OPTION, + validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once, case_sensitive=False), + validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy, case_sensitive=False), split_size: float = 0.2, cv_folds: int = 5, n_estimators: int = 100, @@ -1535,10 +1540,10 @@ def random_forest_classifier_train_cli( @app.command() def random_forest_regressor_train_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], - validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once), - validation_metric: RegressorMetrics = typer.Option(default=RegressorMetrics.mse), + target_labels: INPUT_FILE_OPTION, + output_file: OUTPUT_FILE_OPTION, + validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once, case_sensitive=False), + validation_metric: RegressorMetrics = typer.Option(default=RegressorMetrics.mse, case_sensitive=False), split_size: float = 0.2, cv_folds: int = 5, n_estimators: int = 100, @@ -1585,10 +1590,10 @@ def random_forest_regressor_train_cli( @app.command() def gradient_boosting_classifier_train_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], - validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once), - validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy), + target_labels: INPUT_FILE_OPTION, + output_file: OUTPUT_FILE_OPTION, + validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once, case_sensitive=False), + validation_metric: ClassifierMetrics = typer.Option(default=ClassifierMetrics.accuracy, case_sensitive=False), split_size: float = 0.2, cv_folds: int = 5, loss: GradientBoostingClassifierLosses = typer.Option(default=GradientBoostingClassifierLosses.log_loss), @@ -1641,10 +1646,10 @@ def gradient_boosting_classifier_train_cli( @app.command() def gradient_boosting_regressor_train_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - output_file: Annotated[Path, OUTPUT_FILE_OPTION], - validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once), - validation_metric: RegressorMetrics = typer.Option(default=RegressorMetrics.mse), + target_labels: INPUT_FILE_OPTION, + output_file: OUTPUT_FILE_OPTION, + validation_method: ValidationMethods = typer.Option(default=ValidationMethods.split_once, case_sensitive=False), + validation_metric: RegressorMetrics = typer.Option(default=RegressorMetrics.mse, case_sensitive=False), split_size: float = 0.2, cv_folds: int = 5, loss: GradientBoostingRegressorLosses = typer.Option(default=GradientBoostingRegressorLosses.squared_error), @@ -1697,9 +1702,9 @@ def gradient_boosting_regressor_train_cli( @app.command() def evaluate_trained_model_cli( input_rasters: INPUT_FILES_ARGUMENT, - target_labels: Annotated[Path, INPUT_FILE_OPTION], - model_file: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + target_labels: INPUT_FILE_OPTION, + model_file: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, validation_metric: str = 
typer.Option(),
 ):
     """Evaluate a trained machine learning model."""
@@ -1740,8 +1745,8 @@
 @app.command()
 def predict_with_trained_model_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    model_file: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    model_file: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Predict with a trained machine learning model."""
     from eis_toolkit.prediction.machine_learning_general import (
@@ -1779,7 +1784,7 @@
 @app.command()
 def and_overlay_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Compute an 'and' overlay operation with fuzzy logic."""
     from eis_toolkit.prediction.fuzzy_overlay import and_overlay
@@ -1807,7 +1812,7 @@
 @app.command()
 def or_overlay_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Compute an 'or' overlay operation with fuzzy logic."""
     from eis_toolkit.prediction.fuzzy_overlay import or_overlay
@@ -1835,7 +1840,7 @@
 @app.command()
 def product_overlay_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Compute a 'product' overlay operation with fuzzy logic."""
     from eis_toolkit.prediction.fuzzy_overlay import product_overlay
@@ -1863,7 +1868,7 @@
 @app.command()
 def sum_overlay_cli(
     input_rasters: INPUT_FILES_ARGUMENT,
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    output_raster: OUTPUT_FILE_OPTION,
 ):
     """Compute a 'sum' overlay operation with fuzzy logic."""
     from eis_toolkit.prediction.fuzzy_overlay import sum_overlay
@@ -1889,9 +1894,7 @@
 # GAMMA OVERLAY
 @app.command()
-def gamma_overlay_cli(
-    input_rasters: INPUT_FILES_ARGUMENT, output_raster: Annotated[Path, OUTPUT_FILE_OPTION], gamma: float = 0.5
-):
+def gamma_overlay_cli(input_rasters: INPUT_FILES_ARGUMENT, output_raster: OUTPUT_FILE_OPTION, gamma: float = 0.5):
     """Compute a 'gamma' overlay operation with fuzzy logic."""
     from eis_toolkit.prediction.fuzzy_overlay import gamma_overlay
     from eis_toolkit.utilities.file_io import read_and_stack_rasters
@@ -1924,8 +1927,8 @@
 # CODA - ALR TRANSFORM
 @app.command()
 def alr_transform_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION],
-    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_vector: INPUT_FILE_OPTION,
+    output_vector: OUTPUT_FILE_OPTION,
     column: str = None,
     keep_denominator_column: bool = False,
 ):
@@ -1951,8 +1954,8 @@
 # CODA - INVERSE ALR TRANSFORM
 @app.command()
 def inverse_alr_transform_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION],
-    output_vector: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_vector: INPUT_FILE_OPTION,
+    output_vector: OUTPUT_FILE_OPTION,
     denominator_column: str = typer.Option(),
     scale: float = 1.0,
 ):
@@ -1977,9 +1980,7 @@
 # CODA - CLR TRANSFORM
 @app.command()
-def clr_transform_cli(
-    input_vector: Annotated[Path, INPUT_FILE_OPTION], output_vector: Annotated[Path, OUTPUT_FILE_OPTION]
-):
+def clr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION):
     """Perform a centered logratio transformation on the data."""
     from
eis_toolkit.transformations.coda.clr import clr_transform @@ -2002,8 +2003,8 @@ def clr_transform_cli( # CODA - INVERSE CLR TRANSFORM @app.command() def inverse_clr_transform_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, colnames: Annotated[List[str], typer.Option()] = None, scale: float = 1.0, ): @@ -2029,8 +2030,8 @@ def inverse_clr_transform_cli( # CODA - SINGLE ILR TRANSFORM @app.command() def single_ilr_transform_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, subcomposition_1: Annotated[List[str], typer.Option()], subcomposition_2: Annotated[List[str], typer.Option()], ): @@ -2058,8 +2059,8 @@ def single_ilr_transform_cli( # CODA - PAIRWISE LOGRATIO TRANSFORM @app.command() def pairwise_logratio_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, numerator_column: str = typer.Option(), denominator_column: str = typer.Option(), ): @@ -2087,8 +2088,8 @@ def pairwise_logratio_cli( # CODA - SINGLE PLR TRANSFORM @app.command() def single_plr_transform_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], - output_vector: Annotated[Path, OUTPUT_FILE_OPTION], + input_vector: INPUT_FILE_OPTION, + output_vector: OUTPUT_FILE_OPTION, column: str = typer.Option(), ): """Perform a pivot logratio transformation on the selected column.""" @@ -2114,9 +2115,7 @@ def single_plr_transform_cli( # CODA - PLR TRANSFORM @app.command() -def plr_transform_cli( - input_vector: Annotated[Path, INPUT_FILE_OPTION], output_vector: Annotated[Path, OUTPUT_FILE_OPTION] -): +def plr_transform_cli(input_vector: INPUT_FILE_OPTION, output_vector: OUTPUT_FILE_OPTION): """Perform a pivot logratio transformation on the dataframe, returning the full set of transforms.""" from eis_toolkit.transformations.coda.plr import plr_transform @@ -2139,8 +2138,8 @@ def plr_transform_cli( # BINARIZE @app.command() def binarize_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, threshold: float = typer.Option(), ): """ @@ -2168,8 +2167,8 @@ def binarize_cli( # CLIP TRANSFORM @app.command() def clip_transform_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, limit_lower: Optional[float] = None, limit_higher: Optional[float] = None, ): @@ -2198,8 +2197,8 @@ def clip_transform_cli( # Z-SCORE NORMALIZATION @app.command() def z_score_normalization_cli( - input_raster: Annotated[Path, INPUT_FILE_OPTION], - output_raster: Annotated[Path, OUTPUT_FILE_OPTION], + input_raster: INPUT_FILE_OPTION, + output_raster: OUTPUT_FILE_OPTION, ): """ Normalize data based on mean and standard deviation. 
@@ -2225,8 +2224,8 @@ def z_score_normalization_cli(
 # MIN_MAX SCALING
 @app.command()
 def min_max_scaling_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     min: float = 0.0,
     max: float = 1.0,
 ):
@@ -2254,9 +2253,9 @@ def min_max_scaling_cli(
 # LOGARITHMIC
 @app.command()
 def log_transform_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
-    log_type: LogarithmTransforms = LogarithmTransforms.log2,
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
+    log_type: LogarithmTransforms = typer.Option(LogarithmTransforms.log2, case_sensitive=False),
 ):
     """
     Perform a logarithmic transformation on the provided data.
@@ -2283,8 +2282,8 @@ def log_transform_cli(
 # SIGMOID
 @app.command()
 def sigmoid_transform_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     limit_lower: float = 0.0,
     limit_upper: float = 1.0,
     slope: float = 1,
@@ -2316,8 +2315,8 @@ def sigmoid_transform_cli(
 # WINSORIZE
 @app.command()
 def winsorize_transform_cli(
-    input_raster: Annotated[Path, INPUT_FILE_OPTION],
-    output_raster: Annotated[Path, OUTPUT_FILE_OPTION],
+    input_raster: INPUT_FILE_OPTION,
+    output_raster: OUTPUT_FILE_OPTION,
     percentile_lower: Optional[float] = None,
     percentile_higher: Optional[float] = None,
     inside: bool = False,
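
The alias and case-sensitivity changes in PATCH 10 above follow Typer's standard patterns: a reusable Annotated type alias replaces the repeated Annotated[Path, ...] declarations, and case_sensitive=False lets enum choices be typed in any case. A minimal self-contained sketch of the same idea (demo_cli, EXISTING_FILE_OPTION and the two-member ResamplingMethods enum are illustrative stand-ins here, not the toolkit's definitions):

    from enum import Enum
    from pathlib import Path

    import typer
    from typing_extensions import Annotated

    app = typer.Typer()

    # Reusable option alias: declared once, shared by every command that needs it.
    EXISTING_FILE_OPTION = Annotated[
        Path,
        typer.Option(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
    ]

    class ResamplingMethods(str, Enum):
        nearest = "nearest"
        bilinear = "bilinear"

    @app.command()
    def demo_cli(
        input_raster: EXISTING_FILE_OPTION,
        resampling_method: ResamplingMethods = typer.Option(
            ResamplingMethods.nearest, case_sensitive=False  # "NEAREST" and "Bilinear" parse too
        ),
    ):
        """Echo the parsed parameters."""
        typer.echo(f"{input_raster} {resampling_method.value}")

    if __name__ == "__main__":
        app()

Invoked as python demo.py --input-raster dem.tif --resampling-method BILINEAR, the enum value should resolve regardless of case, which is what the case_sensitive=False sweep across the CLI aims for.
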