-
Notifications
You must be signed in to change notification settings - Fork 622
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add feature importance * Linters * add spark support --------- Co-authored-by: 0lgaF <[email protected]> Co-authored-by: mike0sv <[email protected]>
- Loading branch information
1 parent
2b7a0af
commit 4258133
Showing
6 changed files
with
228 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
from typing import Dict | ||
from typing import List | ||
from typing import Optional | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.ensemble import RandomForestClassifier | ||
from sklearn.ensemble import RandomForestRegressor | ||
from sklearn.preprocessing import OrdinalEncoder | ||
|
||
from evidently.base_metric import InputData | ||
from evidently.base_metric import Metric | ||
from evidently.base_metric import MetricResult | ||
from evidently.core import ColumnType | ||
from evidently.model.widget import BaseWidgetInfo | ||
from evidently.renderers.base_renderer import MetricRenderer | ||
from evidently.renderers.base_renderer import default_renderer | ||
from evidently.utils.data_preprocessing import DataDefinition | ||
|
||
SAMPLE_SIZE = 5000 | ||
|
||
|
||
class FeatureImportanceMetricResult(MetricResult):
    """Result of the feature importance metric.

    Both fields default to ``None``.
    """

    # Importance score keyed by feature column name, for the current dataset.
    current: Optional[Dict[str, float]] = None
    # Same mapping for the reference dataset.
    reference: Optional[Dict[str, float]] = None
|
||
|
||
class FeatureImportanceMetric(Metric[FeatureImportanceMetricResult]):
    """Metric that reports feature importances for current and reference data."""

    def calculate(self, data: InputData) -> FeatureImportanceMetricResult:
        """Return pre-computed importances if supplied, otherwise estimate them.

        When ``additional_datasets`` carries a ``"current_feature_importance"``
        entry, it (and the optional ``"reference_feature_importance"`` entry)
        is returned as is. Otherwise up to ``SAMPLE_SIZE`` rows are sampled from
        each dataset with a fixed seed and handed to
        ``get_feature_importance_from_samples``.
        """
        extras = data.additional_datasets
        precomputed_current = extras.get("current_feature_importance")
        if precomputed_current is not None:
            return FeatureImportanceMetricResult(
                current=precomputed_current,
                reference=extras.get("reference_feature_importance"),
            )

        current_df = data.current_data
        curr_sampled_data = current_df.sample(min(SAMPLE_SIZE, current_df.shape[0]), random_state=0)

        reference_df = data.reference_data
        ref_sampled_data: Optional[pd.DataFrame] = None
        if reference_df is not None:
            ref_sampled_data = reference_df.sample(min(SAMPLE_SIZE, reference_df.shape[0]), random_state=0)

        return get_feature_importance_from_samples(data.data_definition, curr_sampled_data, ref_sampled_data)
|
||
|
||
def get_feature_importance_from_samples(
    data_definition: DataDefinition, curr_sampled_data: pd.DataFrame, ref_sampled_data: Optional[pd.DataFrame]
) -> FeatureImportanceMetricResult:
    """Estimate feature importances by fitting a random forest on sampled data.

    Args:
        data_definition: Supplies the numerical and categorical feature
            columns, the target column and the task.
        curr_sampled_data: Sample of the current dataset. Its categorical
            feature columns are overwritten in place with ordinal codes.
        ref_sampled_data: Optional sample of the reference dataset, encoded in
            place the same way.

    Returns:
        A result whose ``current`` / ``reference`` fields map feature names to
        importances rounded to 3 decimals. Both fields are ``None`` when the
        data definition has no target column; ``reference`` is ``None`` when no
        reference sample is given.
    """
    # Bail out before touching the frames: without a target there is nothing
    # to fit, so encoding (and mutating) the inputs would be wasted work.
    target_column = data_definition.get_target_column()
    if target_column is None:
        return FeatureImportanceMetricResult(current=None, reference=None)
    target_name = target_column.column_name

    num_cols = data_definition.get_columns(filter_def=ColumnType.Numerical, features_only=True)
    cat_cols = data_definition.get_columns(filter_def=ColumnType.Categorical, features_only=True)
    num_names = [x.column_name for x in num_cols]
    cat_names = [x.column_name for x in cat_cols]
    columns = num_names + cat_names

    for col in cat_names:
        enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
        curr_sampled_data[col] = enc.fit_transform(curr_sampled_data[col].values.reshape(-1, 1))
        if ref_sampled_data is not None:
            # The encoder is re-fitted here, so reference codes are assigned
            # independently of the current data's codes.
            ref_sampled_data[col] = enc.fit_transform(ref_sampled_data[col].values.reshape(-1, 1))

    if data_definition.task == "regression":
        model = RandomForestRegressor(min_samples_leaf=10)
    else:
        model = RandomForestClassifier(min_samples_leaf=10)

    def _fit_importances(frame: pd.DataFrame) -> Dict[str, float]:
        # Each fit() call retrains the forest from scratch on the given frame.
        model.fit(frame[columns], frame[target_name])
        return {x: np.round(y, 3) for x, y in zip(columns, model.feature_importances_)}

    current_fi = _fit_importances(curr_sampled_data)
    reference_fi: Optional[Dict[str, float]] = None
    if ref_sampled_data is not None:
        reference_fi = _fit_importances(ref_sampled_data)
    return FeatureImportanceMetricResult(current=current_fi, reference=reference_fi)
|
||
|
||
@default_renderer(wrap_type=FeatureImportanceMetric)
class FeatureImportanceRenderer(MetricRenderer):
    """Default renderer for ``FeatureImportanceMetric``; it emits no widgets."""

    def render_html(self, obj: FeatureImportanceMetric) -> List[BaseWidgetInfo]:
        """Return an empty widget list for the given metric."""
        widgets: List[BaseWidgetInfo] = []
        return widgets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from .engine import SparkEngine | ||
from .metrics import data_drift | ||
from .metrics import feature_importance | ||
|
||
# Public names of the Spark package; "feature_importance" is exported so the
# Spark implementation of the metric is importable from here.
__all__ = ["SparkEngine", "data_drift", "feature_importance"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from typing import Optional | ||
|
||
import pandas as pd | ||
|
||
from evidently.calculation_engine.engine import metric_implementation | ||
from evidently.metrics.data_drift.feature_importance import SAMPLE_SIZE | ||
from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetric | ||
from evidently.metrics.data_drift.feature_importance import FeatureImportanceMetricResult | ||
from evidently.metrics.data_drift.feature_importance import get_feature_importance_from_samples | ||
from evidently.spark.engine import SparkInputData | ||
from evidently.spark.engine import SparkMetricImplementation | ||
|
||
|
||
def _sample_to_pandas(spark_df) -> pd.DataFrame:
    """Collect roughly ``SAMPLE_SIZE`` rows (or everything, if fewer) into pandas."""
    row_count = spark_df.count()
    if row_count < SAMPLE_SIZE:
        return spark_df.toPandas()
    # The fraction is target size over population size, which keeps it within
    # (0, 1]. The inverse ratio would be >= 1 on this branch, which is not a
    # valid fraction for sampling without replacement.
    return spark_df.sample(fraction=SAMPLE_SIZE / row_count, seed=0).toPandas()


@metric_implementation(FeatureImportanceMetric)
class SparkFeatureImportanceMetric(SparkMetricImplementation[FeatureImportanceMetric]):
    """Spark implementation of ``FeatureImportanceMetric``."""

    def calculate(self, context, data: SparkInputData) -> FeatureImportanceMetricResult:
        """Return pre-computed importances if supplied, otherwise estimate them.

        When ``additional_datasets`` carries a ``"current_feature_importance"``
        entry, it (and the optional ``"reference_feature_importance"`` entry)
        is returned as is. Otherwise the Spark data is down-sampled with a
        fixed seed, converted to pandas, and passed to
        ``get_feature_importance_from_samples``.
        """
        if data.additional_datasets.get("current_feature_importance") is not None:
            return FeatureImportanceMetricResult(
                current=data.additional_datasets.get("current_feature_importance"),
                reference=data.additional_datasets.get("reference_feature_importance"),
            )

        curr_sampled_data: pd.DataFrame = _sample_to_pandas(data.current_data)
        ref_sampled_data: Optional[pd.DataFrame] = None
        if data.reference_data is not None:
            ref_sampled_data = _sample_to_pandas(data.reference_data)

        return get_feature_importance_from_samples(data.data_definition, curr_sampled_data, ref_sampled_data)
Oops, something went wrong.