Skip to content

Commit

Permalink
add descriptors to data def (#1165)
Browse files Browse the repository at this point in the history
* add descriptors to data def

* add class name

* lint

* move descriptors to separate field

* mypy

* fix
  • Loading branch information
mike0sv authored Jun 27, 2024
1 parent 6fe5ca8 commit 865824e
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/evidently/base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def required_features(self, data_definition: DataDefinition) -> List["GeneratedF
for field, value in sorted(self.__dict__.items(), key=lambda x: x[0]):
if field in ["context"]:
continue
if issubclass(type(value), ColumnName) and value.feature_class is not None:
if isinstance(value, ColumnName) and value.feature_class is not None:
required_features.append(value.feature_class)
return required_features

Expand Down
12 changes: 8 additions & 4 deletions src/evidently/calculation_engine/engine.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import abc
import functools
import logging
from typing import TYPE_CHECKING
from typing import Dict
from typing import Generic
from typing import List
Expand All @@ -19,8 +20,11 @@
from evidently.features.generated_features import GeneratedFeature
from evidently.utils.data_preprocessing import DataDefinition

if TYPE_CHECKING:
from evidently.suite.base_suite import Context

TMetricImplementation = TypeVar("TMetricImplementation", bound=MetricImplementation)
TInputData = TypeVar("TInputData")
TInputData = TypeVar("TInputData", bound=GenericInputData)


class Engine(Generic[TMetricImplementation, TInputData]):
Expand All @@ -34,10 +38,10 @@ def set_metrics(self, metrics):
def set_tests(self, tests):
self.tests = tests

def execute_metrics(self, context, data: GenericInputData):
def execute_metrics(self, context: "Context", data: GenericInputData):
calculations: Dict[Metric, Union[ErrorResult, MetricResult]] = {}
converted_data = self.convert_input_data(data)
context.features = self.generate_additional_features(converted_data)
context.set_features(self.generate_additional_features(converted_data))
context.data = converted_data
for metric, calculation in self.get_metric_execution_iterator():
if calculation not in calculations:
Expand Down Expand Up @@ -65,7 +69,7 @@ def get_data_definition(
raise NotImplementedError()

@abc.abstractmethod
def generate_additional_features(self, data: TInputData):
def generate_additional_features(self, data: TInputData) -> Optional[Dict[tuple, GeneratedFeature]]:
    """Produce the generated features for ``data``.

    Abstract hook: this base implementation always raises
    ``NotImplementedError``; concrete engines supply the behaviour.
    The annotated result is a mapping of tuple keys to generated
    features, or ``None``.
    """
    raise NotImplementedError()

def get_metric_implementation(self, metric):
Expand Down
4 changes: 2 additions & 2 deletions src/evidently/calculation_engine/python_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def get_data_definition(
raise ValueError("PandasEngine works only with pd.DataFrame input data")
return create_data_definition(reference_data, current_data, column_mapping, categorical_features_cardinality)

def generate_additional_features(self, data: PythonInputData):
def generate_additional_features(self, data: PythonInputData) -> Dict[tuple, GeneratedFeature]:
curr_additional_data = None
ref_additional_data = None
features = {}
features: Dict[tuple, GeneratedFeature] = {}
for metric, calculation in self.get_metric_execution_iterator():
try:
required_features = metric.required_features(data.data_definition)
Expand Down
4 changes: 3 additions & 1 deletion src/evidently/spark/engine.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
from typing import Dict
from typing import Generic
from typing import List
from typing import Optional
Expand All @@ -16,6 +17,7 @@
from evidently.calculation_engine.engine import Engine
from evidently.calculation_engine.metric_implementation import MetricImplementation
from evidently.core import ColumnType
from evidently.features.generated_features import GeneratedFeature
from evidently.spark.base import SparkDataFrame
from evidently.spark.base import SparkSeries
from evidently.spark.base import create_data_definition_spark
Expand Down Expand Up @@ -127,7 +129,7 @@ def get_data_definition(
):
return create_data_definition_spark(current_data, reference_data, column_mapping)

def generate_additional_features(self, data: SparkInputData):
def generate_additional_features(self, data: SparkInputData) -> Optional[Dict[tuple, GeneratedFeature]]:
    """Return no generated features for this engine.

    The body does nothing with ``data`` and the result is always ``None``.
    """
    return None


Expand Down
22 changes: 22 additions & 0 deletions src/evidently/suite/base_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from evidently.utils.dashboard import save_data_file
from evidently.utils.dashboard import save_lib_files
from evidently.utils.data_preprocessing import DataDefinition
from evidently.utils.data_preprocessing import FeatureDefinition

USE_UJSON = False

Expand Down Expand Up @@ -91,6 +92,10 @@ def _discover_dependencies(test: Union[Metric, Test]) -> Iterator[Tuple[str, Uni
yield field_name, field


class RunMetadata(BaseModel):
    """Metadata about a run, carried on ``Context`` and ``ContextPayload``."""

    # Descriptor definitions keyed by feature name; starts out empty.
    descriptors: Dict[str, FeatureDefinition] = {}


@dataclasses.dataclass
class Context:
"""Pipeline execution context tracks pipeline execution and lifecycle"""
Expand All @@ -106,6 +111,7 @@ class Context:
features: Optional[Dict[tuple, GeneratedFeature]] = None
options: Options = Options()
data_definition: Optional["DataDefinition"] = None
run_metadata: RunMetadata = dataclasses.field(default_factory=RunMetadata)

def get_data_definition(
self,
Expand All @@ -125,6 +131,20 @@ def get_data_definition(
def get_datasets(self):
    """Ask the engine to form datasets from this context's current state.

    Passes ``self.data``, ``self.features`` and ``self.data_definition``
    to ``engine.form_datasets`` and returns whatever that call returns.
    """
    engine = self.engine
    return engine.form_datasets(self.data, self.features, self.data_definition)

def set_features(self, features: Optional[Dict[tuple, GeneratedFeature]]):
    """Store generated features and record a descriptor for each of them.

    When ``features`` is ``None`` nothing happens: ``self.features`` and
    ``self.run_metadata`` are left as they were. Otherwise ``self.features``
    is replaced and, for every feature in the mapping, a
    ``FeatureDefinition`` is written to ``self.run_metadata.descriptors``
    under the feature's name (a later feature with the same name overwrites
    an earlier one).
    """
    if features is not None:
        self.features = features
        for generated in features.values():
            column = generated.feature_name()
            source = column.feature_class
            definition = FeatureDefinition(
                feature_name=column.name,
                display_name=column.display_name,
                feature_type=source.feature_type,  # type: ignore[union-attr]
                # Stored as the class *name* (a string), not the class object.
                feature_class=source.__class__.__name__,
            )
            self.run_metadata.descriptors[column.name] = definition


class ContextPayload(BaseModel):
metrics: List[Metric]
Expand All @@ -133,6 +153,7 @@ class ContextPayload(BaseModel):
test_results: List[TestResult]
options: Options = Options()
data_definition: Optional[DataDefinition]
run_metadata: RunMetadata = RunMetadata()

@classmethod
def from_context(cls, context: Context):
Expand All @@ -143,6 +164,7 @@ def from_context(cls, context: Context):
test_results=list(context.test_results.values()),
options=context.options,
data_definition=context.data_definition,
run_metadata=context.run_metadata,
)

def to_context(self) -> Context:
Expand Down
7 changes: 7 additions & 0 deletions src/evidently/utils/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ def __init__(self, column_name: str, column_type: ColumnType):
super().__init__(column_name=column_name, column_type=column_type)


class FeatureDefinition(BaseModel):
    """Serializable description of a single generated feature (descriptor)."""

    # Name of the feature.
    feature_name: str
    # Human-readable name for display; may be None.
    display_name: Optional[str]
    # Column type of the feature.
    feature_type: ColumnType
    # Class name kept as a plain string.
    # NOTE(review): presumably the name of the generating feature class — confirm against callers.
    feature_class: str


class PredictionColumns(BaseModel):
predicted_values: Optional[ColumnDefinition] = None
prediction_probas: Optional[List[ColumnDefinition]] = None
Expand Down

0 comments on commit 865824e

Please sign in to comment.