Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,002 changes: 1,002 additions & 0 deletions examples/cookbook/correlations_metrics.ipynb

Large diffs are not rendered by default.

252 changes: 126 additions & 126 deletions examples/cookbook/metrics.ipynb

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions src/evidently/core/metric_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from typing import Union

import numpy as np
import pandas as pd
import typing_inspect

from evidently._pydantic_compat import BaseModel
Expand Down Expand Up @@ -460,6 +461,33 @@ def set_metric_location(self, metric: MetricConfig):
self.std.metric_value_location = mean_std_value_location(metric, False)


# Metric result (a MetricResult subclass) whose `value` field is a pandas DataFrame.
class DataframeValue(MetricResult):
# The DataFrame carried by this result.
value: pd.DataFrame
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this serialize correctly into JSON for arbitrary dataframes?
If we allow this to be a result type, we need to be ready to support all possible data of this type, and that can be challenging.


def set_metric_location(self, metric: MetricConfig):
    """Attach the location for ``metric`` to this result.

    The location is produced by ``dataframe_value_location`` and stored on
    ``self.metric_value_location``.
    """
    location = dataframe_value_location(metric)
    self.metric_value_location = location

def to_simple_dict(self) -> object:
    """Return the wrapped DataFrame converted with ``DataFrame.to_dict()``."""
    frame = self.value
    return frame.to_dict()

def iter_single_values(self) -> typing.Iterator[SingleValue]:
    """Flatten the DataFrame into one ``SingleValue`` per numeric cell.

    Columns are split by dtype: numeric columns supply the values, every
    other column is stringified and attached as a label. Each yielded value
    is located by the owning metric plus ``{"column": <numeric column>}``
    merged with that row's labels.

    NOTE(review): the row index is not part of the location, so rows that
    share the same label values (or frames with no label columns at all)
    yield identical locations - confirm this is intended.
    """
    frame = self.value
    numeric_columns = frame.select_dtypes(include=["number"]).columns.tolist()
    label_columns = frame.select_dtypes(exclude=["number"]).columns.tolist()
    assert self.metric_value_location is not None
    owner = self.metric_value_location.metric
    for _, row in frame.iterrows():
        cells = row.to_dict()
        row_labels = {name: str(cells[name]) for name in label_columns}
        for name in numeric_columns:
            yield SingleValue(
                value=cells[name],
                display_name=name,
                metric_value_location=MetricValueLocation(owner, {"column": name, **row_labels}),
            )


class DatasetType(enum.Enum):
Current = "current"
Reference = "reference"
Expand All @@ -473,6 +501,10 @@ def by_label_location(metric: MetricConfig, label: Label) -> MetricValueLocation
return MetricValueLocation(metric, {"label": label})


def dataframe_value_location(metric: MetricConfig) -> MetricValueLocation:
    """Build the location for a DataFrame result: ``metric`` with an empty mapping as second argument."""
    empty_mapping: dict = {}
    return MetricValueLocation(metric, empty_mapping)


# Type alias restricting a slot name to the literal strings "count" or "share".
ByLabelCountSlot = Union[Literal["count"], Literal["share"]]


Expand Down
2 changes: 2 additions & 0 deletions src/evidently/core/registries/metric_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
# Alias registrations for MetricResult types: each call passes the base type,
# a dotted class path and an alias string.
register_type_alias(MetricResult, "evidently.core.metric_types.CountValue", "evidently:metric_result_v2:CountValue")
register_type_alias(MetricResult, "evidently.core.metric_types.MeanStdValue", "evidently:metric_result_v2:MeanStdValue")
register_type_alias(MetricResult, "evidently.core.metric_types.SingleValue", "evidently:metric_result_v2:SingleValue")

# DataFrame-valued result type.
register_type_alias(MetricResult, "evidently.core.metric_types.DataframeValue", "evidently:metric_result_v2:DataframeValue")
4 changes: 4 additions & 0 deletions src/evidently/core/registries/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,7 @@

# Alias registrations for Metric types: each call passes the base type,
# a dotted class path and an alias string.
register_type_alias(Metric, "evidently.metrics.regression.MeanStdRegressionMetric", "evidently:metric_v2:MeanStdRegressionMetric")
register_type_alias(Metric, "evidently.metrics.regression.SingleValueRegressionMetric", "evidently:metric_v2:SingleValueRegressionMetric")

# Correlation metrics defined in evidently.metrics.data_quality.
register_type_alias(Metric, "evidently.metrics.data_quality.ColumnCorrelations", "evidently:metric_v2:ColumnCorrelations")

register_type_alias(Metric, "evidently.metrics.data_quality.DatasetCorrelations", "evidently:metric_v2:DatasetCorrelations")
12 changes: 10 additions & 2 deletions src/evidently/legacy/ui/demo_projects/bikes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,19 @@ def create_data():
if os.path.exists("Bike-Sharing-Dataset.zip"):
with open("Bike-Sharing-Dataset.zip", "rb") as f:
content = f.read()
elif os.path.exists("../../../../../test_data/bike_sharing_dataset.zip"):
with open("../../../../../test_data/bike_sharing_dataset.zip", "rb") as f:
content = f.read()
else:
content = requests.get(
response = requests.get(
"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip",
verify=False,
).content
)
if response.status_code != 200:
raise ValueError(f"Could not download bike sharing dataset. {response.text}")
if response.status_code == 200 and response.headers["content-type"] != "application/zip":
raise ValueError(f"Invalid bike sharing dataset content type: {response.headers['content-type']}.")
content = response.content
with zipfile.ZipFile(io.BytesIO(content)) as arc:
raw_data = pd.read_csv(
arc.open("hour.csv"),
Expand Down
5 changes: 5 additions & 0 deletions src/evidently/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
from .column_statistics import SumValue
from .column_statistics import UniqueValueCount
from .column_statistics import ValueDrift
from .data_quality import ColumnCorrelations
from .data_quality import DatasetCorrelations
from .dataset_statistics import AlmostConstantColumnsCount
from .dataset_statistics import AlmostDuplicatedColumnsCount
from .dataset_statistics import ColumnCount
Expand Down Expand Up @@ -142,4 +144,7 @@
"DummyFPR",
"DummyFNR",
"DummyAccuracy",
# Data Quality
"ColumnCorrelations",
"DatasetCorrelations",
]
87 changes: 87 additions & 0 deletions src/evidently/metrics/data_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple

from evidently.core.metric_types import BoundTest
from evidently.core.metric_types import DataframeValue
from evidently.core.metric_types import Metric
from evidently.core.report import Context
from evidently.legacy.metrics.data_quality.column_correlations_metric import ColumnCorrelationsMetric
from evidently.legacy.metrics.data_quality.column_correlations_metric import ColumnCorrelationsMetricResult
from evidently.legacy.metrics.data_quality.dataset_correlations_metric import DatasetCorrelationsMetric
from evidently.legacy.metrics.data_quality.dataset_correlations_metric import DatasetCorrelationsMetricResult
from evidently.legacy.model.widget import BaseWidgetInfo
from evidently.metrics._legacy import LegacyMetricCalculation


class ColumnCorrelations(Metric):
    """Metric configuration for the correlations of one column with the other columns.

    ``column_name`` names the column of interest. No bound tests are declared.
    """

    column_name: str

    def get_bound_tests(self, context: "Context") -> Sequence[BoundTest]:
        """Return an empty list: this metric declares no bound tests."""
        return list()


class LegacyColumnCorrelationsCalculation(
    LegacyMetricCalculation[
        DataframeValue,
        ColumnCorrelations,
        ColumnCorrelationsMetricResult,
        ColumnCorrelationsMetric,
    ],
):
    """Calculation for ``ColumnCorrelations`` that delegates to the legacy ``ColumnCorrelationsMetric``."""

    def legacy_metric(self) -> ColumnCorrelationsMetric:
        """Create the legacy metric for the configured column."""
        column = self.metric.column_name
        return ColumnCorrelationsMetric(column_name=column)

    def display_name(self) -> str:
        """Return the title of the result, which embeds the configured column name."""
        column = self.metric.column_name
        return f"Correlations between {column} column and all the other columns."

    def calculate_value(
        self, context: "Context", legacy_result: ColumnCorrelationsMetricResult, render: List[BaseWidgetInfo]
    ) -> Tuple[DataframeValue, Optional[DataframeValue]]:
        """Wrap the legacy result into ``DataframeValue`` objects.

        Returns a ``(current, reference)`` pair; ``reference`` is ``None`` when
        the legacy result has no reference part. The rendered widgets are
        attached to the current value only; the reference value gets an empty
        widget list.

        NOTE(review): only the first entry of each result mapping is used
        (``next(iter(...))``); any further entries are ignored and an empty
        mapping raises ``StopIteration`` - confirm this is intended.
        """
        first_current = next(iter(legacy_result.current.values()))
        current_frame = first_current.get_pandas()
        current_value = DataframeValue(display_name=self.display_name(), value=current_frame)
        current_value.widget = render

        if legacy_result.reference is None:
            return current_value, None

        first_reference = next(iter(legacy_result.reference.values()))
        reference_frame = first_reference.get_pandas()
        reference_value = DataframeValue(display_name=self.display_name(), value=reference_frame)
        reference_value.widget = []
        return current_value, reference_value


class DatasetCorrelations(Metric):
    """Metric configuration for dataset-level correlations; it adds no fields and declares no bound tests."""

    def get_bound_tests(self, context: "Context") -> Sequence[BoundTest]:
        """Return an empty list: this metric declares no bound tests."""
        return list()


class LegacyDatasetCorrelationsCalculation(
    LegacyMetricCalculation[
        DataframeValue,
        DatasetCorrelations,
        DatasetCorrelationsMetricResult,
        DatasetCorrelationsMetric,
    ],
):
    """Calculation for ``DatasetCorrelations`` that delegates to the legacy ``DatasetCorrelationsMetric``."""

    def display_name(self) -> str:
        """Return the fixed title used for this result."""
        return "Calculate different correlations with target, predictions and features"

    def legacy_metric(self) -> DatasetCorrelationsMetric:
        """Create the legacy metric with its default arguments."""
        return DatasetCorrelationsMetric()

    def calculate_value(
        self, context: "Context", legacy_result: DatasetCorrelationsMetricResult, render: List[BaseWidgetInfo]
    ) -> Tuple[DataframeValue, Optional[DataframeValue]]:
        """Wrap the legacy result into ``DataframeValue`` objects.

        Returns a ``(current, reference)`` pair; ``reference`` is ``None`` when
        the legacy result has no reference part. The rendered widgets are
        attached to the current value only; the reference value gets an empty
        widget list.

        NOTE(review): only the first entry of each ``correlation`` mapping is
        used (``next(iter(...))``), presumably a DataFrame - confirm that
        ignoring any further entries is intended.
        """
        current_frame = next(iter(legacy_result.current.correlation.values()))
        current_value = DataframeValue(display_name=self.display_name(), value=current_frame)
        current_value.widget = render

        if legacy_result.reference is None:
            return current_value, None

        reference_frame = next(iter(legacy_result.reference.correlation.values()))
        reference_value = DataframeValue(display_name=self.display_name(), value=reference_frame)
        reference_value.widget = []
        return current_value, reference_value
4 changes: 4 additions & 0 deletions src/evidently/ui/service/storage/local/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from evidently.core.metric_types import ByLabelCountValue
from evidently.core.metric_types import ByLabelValue
from evidently.core.metric_types import CountValue
from evidently.core.metric_types import DataframeValue
from evidently.core.metric_types import MeanStdValue
from evidently.core.metric_types import SingleValue
from evidently.core.serialization import SnapshotModel
Expand Down Expand Up @@ -296,6 +297,9 @@ def _add_snapshot_points_sync(self, project_id: ProjectID, snapshot_id: Snapshot
self._add_value(project_id, snapshot_id, snapshot.timestamp, value)
for value in result.shares.values():
self._add_value(project_id, snapshot_id, snapshot.timestamp, value)
elif isinstance(result, DataframeValue):
for value in result.iter_single_values():
self._add_value(project_id, snapshot_id, snapshot.timestamp, value)
else:
raise ValueError(f"type {type(result)} isn't supported")

Expand Down
25 changes: 25 additions & 0 deletions test_data/bike_sharing_dataset.CITATION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
=========================================
License
=========================================
Use of this dataset in publications must be cited to the following publication:

[1] Fanaee-T, Hadi, and Gama, Joao, "Event labeling combining ensemble detectors and background knowledge", Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg, doi:10.1007/s13748-013-0040-3.

@article{
year={2013},
issn={2192-6352},
journal={Progress in Artificial Intelligence},
doi={10.1007/s13748-013-0040-3},
title={Event labeling combining ensemble detectors and background knowledge},
url={http://dx.doi.org/10.1007/s13748-013-0040-3},
publisher={Springer Berlin Heidelberg},
keywords={Event labeling; Event detection; Ensemble learning; Background knowledge},
author={Fanaee-T, Hadi and Gama, Joao},
pages={1-15}
}

=========================================
Contact
=========================================

For further information about this dataset please contact Hadi Fanaee-T ([email protected])
Binary file added test_data/bike_sharing_dataset.zip
Binary file not shown.
56 changes: 56 additions & 0 deletions tests/future/metrics/test_correlations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import numpy as np
import pandas as pd

from evidently import BinaryClassification
from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.core.metric_types import DataframeValue
from evidently.metrics import ColumnCorrelations
from evidently.metrics.data_quality import DatasetCorrelations


def test_column_correlations():
    """ColumnCorrelations for column "a" yields a single cramer_v row describing column "b"."""
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    dataset = Dataset.from_pandas(frame)
    metric = ColumnCorrelations(column_name="a")

    snapshot = Report(metrics=[metric]).run(dataset)
    result = snapshot.context.get_metric_result(metric)

    assert isinstance(result, DataframeValue)
    expected = pd.DataFrame([{"kind": "cramer_v", "column_name": "b", "value": 1.0}])
    pd.testing.assert_frame_equal(result.value, expected)


def test_dataset_correlations():
    """DatasetCorrelations on a binary-classification dataset yields the target/prediction matrix."""
    repeats = 1000
    frame = pd.DataFrame(
        {
            "my_target": [1, np.nan, 3] * repeats,
            "my_prediction": [1, 2, np.nan] * repeats,
            "feature_1": [1, 2, 3] * repeats,
            "feature_2": ["a", np.nan, "a"] * repeats,
        }
    )
    classification = BinaryClassification(target="my_target", prediction_labels="my_prediction")
    definition = DataDefinition(classification=[classification])
    dataset = Dataset.from_pandas(frame, data_definition=definition)
    metric = DatasetCorrelations()

    snapshot = Report(metrics=[metric]).run(dataset)
    result = snapshot.context.get_metric_result(metric)

    assert isinstance(result, DataframeValue)
    expected = pd.DataFrame(
        [{"my_target": 1, "my_prediction": np.nan}, {"my_target": np.nan, "my_prediction": 1}],
        index=["my_target", "my_prediction"],
    )
    pd.testing.assert_frame_equal(result.value, expected)