From 4258133d878f6d93beff654cddfc02a3a3bf8ca7 Mon Sep 17 00:00:00 2001 From: Olga Filippova <36808731+0lgaF@users.noreply.github.com> Date: Tue, 14 Nov 2023 19:28:11 +0300 Subject: [PATCH] Add feature importance (#852) * Add feature importance * Linters * add spark support --------- Co-authored-by: 0lgaF Co-authored-by: mike0sv --- .../metrics/data_drift/data_drift_table.py | 152 +++++++++++------- .../metrics/data_drift/feature_importance.py | 83 ++++++++++ src/evidently/spark/__init__.py | 3 +- .../spark/metrics/feature_importance.py | 38 +++++ .../data_drift/test_data_drift_table.py | 4 + tests/multitest/metrics/data_drift.py | 8 +- 6 files changed, 228 insertions(+), 60 deletions(-) create mode 100644 src/evidently/metrics/data_drift/feature_importance.py create mode 100644 src/evidently/spark/metrics/feature_importance.py diff --git a/src/evidently/metrics/data_drift/data_drift_table.py b/src/evidently/metrics/data_drift/data_drift_table.py index 50843f2264..15fe3d81a1 100644 --- a/src/evidently/metrics/data_drift/data_drift_table.py +++ b/src/evidently/metrics/data_drift/data_drift_table.py @@ -10,6 +10,7 @@ from evidently.metric_results import DatasetColumns from evidently.metric_results import HistogramData from evidently.metrics.data_drift.base import WithDriftOptions +from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetric from evidently.model.widget import BaseWidgetInfo from evidently.options.base import AnyOptions from evidently.options.data_drift import DataDriftOptions @@ -39,10 +40,14 @@ class Config: dataset_drift: bool drift_by_columns: Dict[str, ColumnDataDriftMetrics] dataset_columns: DatasetColumns + current_fi: Optional[Dict[str, float]] = None + reference_fi: Optional[Dict[str, float]] = None class DataDriftTable(WithDriftOptions[DataDriftTableResults]): columns: Optional[List[str]] + feature_importance: Optional[bool] + _feature_importance_metric: Optional[FeatureImportanceMetric] def __init__( self, @@ -58,6 +63,7 @@ def __init__( text_stattest_threshold: Optional[float] = None, per_column_stattest_threshold: Optional[Dict[str, float]] = None, options: AnyOptions = None, + feature_importance: Optional[bool] = False, ): self.columns = columns super().__init__( @@ -72,6 +78,7 @@ def __init__( text_stattest_threshold=text_stattest_threshold, per_column_stattest_threshold=per_column_stattest_threshold, options=options, + feature_importance=feature_importance, ) self._drift_options = DataDriftOptions( all_features_stattest=stattest, @@ -85,6 +92,10 @@ def __init__( text_features_threshold=text_stattest_threshold, per_feature_threshold=per_column_stattest_threshold, ) + if feature_importance: + self._feature_importance_metric = FeatureImportanceMetric() + else: + self._feature_importance_metric = None def get_parameters(self) -> tuple: return None if self.columns is None else tuple(self.columns), self.drift_options @@ -107,6 +118,14 @@ def calculate(self, data: InputData) -> DataDriftTableResults: columns=self.columns, agg_data=agg_data, ) + current_fi: Optional[Dict[str, float]] = None + reference_fi: Optional[Dict[str, float]] = None + + if self._feature_importance_metric is not None: + res = self._feature_importance_metric.get_result() + current_fi = res.current + reference_fi = res.reference + return DataDriftTableResults( number_of_columns=result.number_of_columns, number_of_drifted_columns=result.number_of_drifted_columns, @@ -114,13 +133,20 @@ def calculate(self, data: InputData) -> DataDriftTableResults: dataset_drift=result.dataset_drift, drift_by_columns=result.drift_by_columns, dataset_columns=result.dataset_columns, + current_fi=current_fi, + reference_fi=reference_fi, ) @default_renderer(wrap_type=DataDriftTable) class DataDriftTableRenderer(MetricRenderer): def _generate_column_params( - self, column_name: str, data: ColumnDataDriftMetrics, agg_data: bool + self, + column_name: str, + data: ColumnDataDriftMetrics, + agg_data: bool, + current_fi: Optional[Dict[str, float]] = None, + reference_fi: Optional[Dict[str, float]] = None, ) -> Optional[RichTableDataRow]: details = RowDetails() if data.column_type == "text": @@ -157,18 +183,13 @@ def _generate_column_params( data_drift = "Detected" if data.drift_detected else "Not Detected" - return RichTableDataRow( - details=details, - fields={ - "column_name": column_name, - "column_type": data.column_type, - "stattest_name": data.stattest_name, - # "reference_distribution": {}, - # "current_distribution": {}, - "data_drift": data_drift, - "drift_score": round(data.drift_score, 6), - }, - ) + fields = { + "column_name": column_name, + "column_type": data.column_type, + "stattest_name": data.stattest_name, + "data_drift": data_drift, + "drift_score": round(data.drift_score, 6), + } else: if ( @@ -221,24 +242,26 @@ def _generate_column_params( ) distribution = plotly_figure(title="", figure=fig) details.with_part("DATA DISTRIBUTION", info=distribution) - return RichTableDataRow( - details=details, - fields={ - "column_name": column_name, - "column_type": data.column_type, - "stattest_name": data.stattest_name, - "reference_distribution": { - "x": list(ref_small_hist.x), - "y": list(ref_small_hist.y), - }, - "current_distribution": { - "x": list(current_small_hist.x), - "y": list(current_small_hist.y), - }, - "data_drift": data_drift, - "drift_score": round(data.drift_score, 6), + fields = { + "column_name": column_name, + "column_type": data.column_type, + "stattest_name": data.stattest_name, + "reference_distribution": { + "x": list(ref_small_hist.x), + "y": list(ref_small_hist.y), }, - ) + "current_distribution": { + "x": list(current_small_hist.x), + "y": list(current_small_hist.y), + }, + "data_drift": data_drift, + "drift_score": round(data.drift_score, 6), + } + if current_fi is not None: + fields["current_feature_importance"] = current_fi.get(column_name, "") + if reference_fi is not None: + fields["reference_feature_importance"] = reference_fi.get(column_name, "") + return RichTableDataRow(details=details, fields=fields) def render_html(self, obj: DataDriftTable) -> List[BaseWidgetInfo]: results = obj.get_result() @@ -268,45 +291,58 @@ def render_html(self, obj: DataDriftTable) -> List[BaseWidgetInfo]: columns = columns + all_columns for column_name in columns: - column_params = self._generate_column_params(column_name, results.drift_by_columns[column_name], agg_data) + column_params = self._generate_column_params( + column_name, + results.drift_by_columns[column_name], + agg_data, + results.current_fi, + results.reference_fi, + ) if column_params is not None: params_data.append(column_params) drift_percents = round(results.share_of_drifted_columns * 100, 3) + table_columns = [ + ColumnDefinition("Column", "column_name"), + ColumnDefinition("Type", "column_type"), + ] + if results.current_fi is not None: + table_columns.append(ColumnDefinition("Current feature importance", "current_feature_importance")) + if results.reference_fi is not None: + table_columns.append(ColumnDefinition("Reference feature importance", "reference_feature_importance")) + table_columns = table_columns + [ + ColumnDefinition( + "Reference Distribution", + "reference_distribution", + ColumnType.HISTOGRAM, + options={ + "xField": "x", + "yField": "y", + "color": color_options.primary_color, + }, + ), + ColumnDefinition( + "Current Distribution", + "current_distribution", + ColumnType.HISTOGRAM, + options={ + "xField": "x", + "yField": "y", + "color": color_options.primary_color, + }, + ), + ColumnDefinition("Data Drift", "data_drift"), + ColumnDefinition("Stat Test", "stattest_name"), + ColumnDefinition("Drift Score", "drift_score"), + ] return [ header_text(label="Data Drift Summary"), rich_table_data( title=f"Drift is detected for {drift_percents}% of columns " f"({results.number_of_drifted_columns} out of {results.number_of_columns}).", - columns=[ - ColumnDefinition("Column", "column_name"), - ColumnDefinition("Type", "column_type"), - ColumnDefinition( - "Reference Distribution", - "reference_distribution", - ColumnType.HISTOGRAM, - options={ - "xField": "x", - "yField": "y", - "color": color_options.primary_color, - }, - ), - ColumnDefinition( - "Current Distribution", - "current_distribution", - ColumnType.HISTOGRAM, - options={ - "xField": "x", - "yField": "y", - "color": color_options.primary_color, - }, - ), - ColumnDefinition("Data Drift", "data_drift"), - ColumnDefinition("Stat Test", "stattest_name"), - ColumnDefinition("Drift Score", "drift_score"), - ], + columns=table_columns, data=params_data, ), ] diff --git a/src/evidently/metrics/data_drift/feature_importance.py b/src/evidently/metrics/data_drift/feature_importance.py new file mode 100644 index 0000000000..ac3e088536 --- /dev/null +++ b/src/evidently/metrics/data_drift/feature_importance.py @@ -0,0 +1,83 @@ +from typing import Dict +from typing import List +from typing import Optional + +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor +from sklearn.preprocessing import OrdinalEncoder + +from evidently.base_metric import InputData +from evidently.base_metric import Metric +from evidently.base_metric import MetricResult +from evidently.core import ColumnType +from evidently.model.widget import BaseWidgetInfo +from evidently.renderers.base_renderer import MetricRenderer +from evidently.renderers.base_renderer import default_renderer +from evidently.utils.data_preprocessing import DataDefinition + +SAMPLE_SIZE = 5000 + + +class FeatureImportanceMetricResult(MetricResult): + current: Optional[Dict[str, float]] = None + reference: Optional[Dict[str, float]] = None + + +class FeatureImportanceMetric(Metric[FeatureImportanceMetricResult]): + def calculate(self, data: InputData) -> FeatureImportanceMetricResult: + if data.additional_datasets.get("current_feature_importance") is not None: + return FeatureImportanceMetricResult( + current=data.additional_datasets.get("current_feature_importance"), + reference=data.additional_datasets.get("reference_feature_importance"), + ) + + curr_sampled_data = data.current_data.sample(min(SAMPLE_SIZE, data.current_data.shape[0]), random_state=0) + ref_sampled_data: Optional[pd.DataFrame] = None + if data.reference_data is not None: + ref_sampled_data = data.reference_data.sample( + min(SAMPLE_SIZE, data.reference_data.shape[0]), random_state=0 + ) + + return get_feature_importance_from_samples(data.data_definition, curr_sampled_data, ref_sampled_data) + + +def get_feature_importance_from_samples( + data_definition: DataDefinition, curr_sampled_data: pd.DataFrame, ref_sampled_data: Optional[pd.DataFrame] +): + num_cols = data_definition.get_columns(filter_def=ColumnType.Numerical, features_only=True) + cat_cols = data_definition.get_columns(filter_def=ColumnType.Categorical, features_only=True) + + columns = [x.column_name for x in num_cols] + [x.column_name for x in cat_cols] + + for col in [x.column_name for x in cat_cols]: + enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan) + curr_sampled_data[col] = enc.fit_transform(curr_sampled_data[col].values.reshape(-1, 1)) + if ref_sampled_data is not None: + ref_sampled_data[col] = enc.fit_transform(ref_sampled_data[col].values.reshape(-1, 1)) + + task = data_definition.task + target_column = data_definition.get_target_column() + if target_column is None: + return FeatureImportanceMetricResult(current=None, reference=None) + target_name = target_column.column_name + if task == "regression": + model = RandomForestRegressor(min_samples_leaf=10) + else: + model = RandomForestClassifier(min_samples_leaf=10) + + model.fit(curr_sampled_data[columns], curr_sampled_data[target_name]) + current_fi = {x: np.round(y, 3) for x, y in zip(columns, model.feature_importances_)} + + reference_fi: Optional[Dict[str, float]] = None + if ref_sampled_data is not None: + model.fit(ref_sampled_data[columns], ref_sampled_data[target_name]) + reference_fi = {x: np.round(y, 3) for x, y in zip(columns, model.feature_importances_)} + return FeatureImportanceMetricResult(current=current_fi, reference=reference_fi) + + +@default_renderer(wrap_type=FeatureImportanceMetric) +class FeatureImportanceRenderer(MetricRenderer): + def render_html(self, obj: FeatureImportanceMetric) -> List[BaseWidgetInfo]: + return [] diff --git a/src/evidently/spark/__init__.py b/src/evidently/spark/__init__.py index 50e23d2ccf..ff12fa1a68 100644 --- a/src/evidently/spark/__init__.py +++ b/src/evidently/spark/__init__.py @@ -1,4 +1,5 @@ from .engine import SparkEngine from .metrics import data_drift +from .metrics import feature_importance -__all__ = ["SparkEngine", "data_drift"] +__all__ = ["SparkEngine", "data_drift", "feature_importance"] diff --git a/src/evidently/spark/metrics/feature_importance.py b/src/evidently/spark/metrics/feature_importance.py new file mode 100644 index 0000000000..e7f6333745 --- /dev/null +++ b/src/evidently/spark/metrics/feature_importance.py @@ -0,0 +1,38 @@ +from typing import Optional + +import pandas as pd + +from evidently.calculation_engine.engine import metric_implementation +from evidently.metrics.data_drift.feature_importance import SAMPLE_SIZE +from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetric +from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetricResult +from evidently.metrics.data_drift.feature_importance import get_feature_importance_from_samples +from evidently.spark.engine import SparkInputData +from evidently.spark.engine import SparkMetricImplementation + + +@metric_implementation(FeatureImportanceMetric) +class SparkFeatureImportanceMetric(SparkMetricImplementation[FeatureImportanceMetric]): + def calculate(self, context, data: SparkInputData) -> FeatureImportanceMetricResult: + if data.additional_datasets.get("current_feature_importance") is not None: + return FeatureImportanceMetricResult( + current=data.additional_datasets.get("current_feature_importance"), + reference=data.additional_datasets.get("reference_feature_importance"), + ) + + cur_count = data.current_data.count() + curr_sampled_data: pd.DataFrame = ( + data.current_data.toPandas() + if cur_count < SAMPLE_SIZE + else data.current_data.sample(cur_count / SAMPLE_SIZE, seed=0).toPandas() + ) + ref_sampled_data: Optional[pd.DataFrame] = None + if data.reference_data is not None: + ref_count = data.reference_data.count() + ref_sampled_data = ( + data.reference_data.toPandas() + if ref_count < SAMPLE_SIZE + else data.reference_data.sample(ref_count / SAMPLE_SIZE, seed=0).toPandas() + ) + + return get_feature_importance_from_samples(data.data_definition, curr_sampled_data, ref_sampled_data) diff --git a/tests/metrics/data_drift/test_data_drift_table.py b/tests/metrics/data_drift/test_data_drift_table.py index 4f59b88358..8fce303b97 100644 --- a/tests/metrics/data_drift/test_data_drift_table.py +++ b/tests/metrics/data_drift/test_data_drift_table.py @@ -152,6 +152,7 @@ def test_data_drift_metrics_with_options() -> None: result = json.loads(result_json) assert result["metrics"][0]["metric"] == "DataDriftTable" assert result["metrics"][0]["result"] == { + "current_fi": None, "dataset_drift": False, "drift_by_columns": { "category_feature": { @@ -187,6 +188,7 @@ def test_data_drift_metrics_with_options() -> None: }, "number_of_columns": 3, "number_of_drifted_columns": 1, + "reference_fi": None, "share_of_drifted_columns": 0.3333333333333333, } @@ -212,6 +214,7 @@ def test_data_drift_metrics_json_output() -> None: result = json.loads(result_json) assert result["metrics"][0]["metric"] == "DataDriftTable" assert result["metrics"][0]["result"] == { + "current_fi": None, "dataset_drift": True, "drift_by_columns": { "category_feature": { @@ -247,5 +250,6 @@ def test_data_drift_metrics_json_output() -> None: }, "number_of_columns": 3, "number_of_drifted_columns": 3, + "reference_fi": None, "share_of_drifted_columns": 1, } diff --git a/tests/multitest/metrics/data_drift.py b/tests/multitest/metrics/data_drift.py index 3606bfc124..f3ed4be55b 100644 --- a/tests/multitest/metrics/data_drift.py +++ b/tests/multitest/metrics/data_drift.py @@ -9,6 +9,7 @@ from evidently.metrics.data_drift.data_drift_table import DataDriftTable from evidently.metrics.data_drift.dataset_drift_metric import DatasetDriftMetric from evidently.metrics.data_drift.embeddings_drift import EmbeddingsDriftMetric +from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetric from evidently.metrics.data_drift.target_by_features_table import TargetByFeaturesTable from evidently.metrics.data_drift.text_descriptors_drift_metric import TextDescriptorsDriftMetric from evidently.metrics.data_drift.text_domain_classifier_drift_metric import TextDomainClassifierDriftMetric @@ -31,9 +32,14 @@ def comment(): ) +@metric +def feature_importance(): + return TestMetric("feature_importance", FeatureImportanceMetric(), NoopOutcome(), dataset_names=["bcancer"]) + + @metric def data_drift_table(): - return TestMetric("data_drift_table", DataDriftTable(), NoopOutcome()) + return TestMetric("data_drift_table", DataDriftTable(), NoopOutcome(), dataset_names=["adult"]) @metric