Merge branch 'sampling'
Vyacheslav Morov committed Jun 18, 2021
2 parents 0341b3f + 44ca361 commit 16b777a
Showing 88 changed files with 808 additions and 614 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,3 +8,5 @@ evidently/examples/.DS_Store
 dist
 build
 MANIFEST
+
+__pycache__
22 changes: 22 additions & 0 deletions config.json
@@ -0,0 +1,22 @@
{
    "data_format": {
        "separator": ",",
        "header": true,
        "date_column": "dteday"
    },
    "column_mapping": {},
    "profile_sections": ["data_drift"],
    "pretty_print": true,
    "sampling": {
        "reference": {
            "type": "none",
            "n": 1,
            "ratio": 0.1
        },
        "current": {
            "type": "nth",
            "n": 2,
            "ratio": 0.1
        }
    }
}
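The sampling block is new in this commit: the reference data is left unsampled ("type": "none"), while the current data keeps every 2nd row ("type": "nth", "n": 2). A minimal sketch of what nth sampling can look like at load time; the skiprows logic and toy data below are illustrative assumptions, since the shipped implementation lives in evidently/runner/loader.py, which is not shown in this diff:

    import io

    import pandas as pd

    # Toy stand-in for the bike-sharing CSV implied by date_column "dteday".
    csv = io.StringIO("dteday,cnt\n2011-01-01,16\n2011-01-02,40\n2011-01-03,32\n2011-01-04,13\n")

    # "type": "nth", "n": 2 -- keep every 2nd data row (file row 0 is the header).
    n = 2
    sampled = pd.read_csv(csv, skiprows=lambda i: i > 0 and i % n != 0)
    print(sampled["dteday"].tolist())  # ['2011-01-02', '2011-01-04']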
18 changes: 18 additions & 0 deletions config.yaml
@@ -0,0 +1,18 @@
data_format:
  separator: ","
  header: true
  date_column: "dteday"
column_mapping: {}
profile_sections:
  - "data_drift"
pretty_print: true
sampling:
  reference:
    type: "nth" # one of "none", "nth", "random"
    n: 5 # used with nth sampling: take every n-th row
    ratio: 0.1 # used with random sampling: fraction of each chunk to take
    random_seed: 4 # used with random sampling: seed for the random generator
  current:
    type: "nth" # one of "none", "nth", "random"
    n: 5 # used with nth sampling: take every n-th row
    ratio: 0.1 # used with random sampling: fraction of each chunk to take
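The comments above describe three sampling strategies. A compact, hedged sketch of how they could be applied to an already-loaded DataFrame; apply_sampling is a hypothetical helper for illustration, while the shipped loader (evidently/runner/loader.py) samples as it reads the file:

    from typing import Optional

    import pandas as pd


    def apply_sampling(df: pd.DataFrame, sampling_type: str = "none", n: int = 1,
                       ratio: float = 1.0, random_seed: Optional[int] = None) -> pd.DataFrame:
        # Hypothetical helper mirroring the "type"/"n"/"ratio"/"random_seed" config fields.
        if sampling_type == "none":
            return df  # use the data as-is
        if sampling_type == "nth":
            return df.iloc[::n]  # keep every n-th row
        if sampling_type == "random":
            return df.sample(frac=ratio, random_state=random_seed)
        raise ValueError(f"unknown sampling type: {sampling_type}")


    df = pd.DataFrame({"x": range(10)})
    print(len(apply_sampling(df, sampling_type="nth", n=5)))  # 2
    print(len(apply_sampling(df, sampling_type="random", ratio=0.1, random_seed=4)))  # 1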
76 changes: 63 additions & 13 deletions evidently/__main__.py
@@ -1,12 +1,16 @@
import argparse
import json
import logging
import os
import sys
from typing import Dict, List

from dataclasses import dataclass

import yaml

from evidently.runner.dashboard_runner import DashboardRunnerOptions, DashboardRunner
from evidently.runner.loader import SamplingOptions
from evidently.runner.profile_runner import ProfileRunner, ProfileRunnerOptions
from evidently.runner.runner import DataOptions

@@ -18,10 +22,17 @@ class DataFormatOptions:
     date_column: str
 
 
+@dataclass
+class Sampling:
+    reference: SamplingOptions
+    current: SamplingOptions
+
+
 @dataclass
 class CalculateOptions:
     data_format: DataFormatOptions
     column_mapping: Dict[str, str]
+    sampling: Sampling
 
 
 @dataclass
@@ -35,22 +46,42 @@ class ProfileOptions(CalculateOptions):
     pretty_print: bool = False
 
 
+def __get_not_none(d, key, default):
+    return default if d.get(key, None) is None else d.get(key)
+
+
 def calculate_dashboard(config: str, reference: str, current: str, output_path: str, report_name: str, **_kv):
     with open(config) as f_config:
-        opts_data = json.load(f_config)
+        if config.endswith(".yaml") or config.endswith(".yml"):
+            opts_data = yaml.load(f_config, Loader=yaml.SafeLoader)
+        elif config.endswith(".json"):
+            opts_data = json.load(f_config)
+        else:
+            raise Exception(f"config .{config.split('.')[-1]} not supported")
+
+    sampling = __get_not_none(opts_data, "sampling", {})
+    ref_sampling = __get_not_none(sampling, "reference", {})
+    cur_sampling = __get_not_none(sampling, "current", {})
 
     opts = DashboardOptions(data_format=DataFormatOptions(**opts_data["data_format"]),
                             column_mapping=opts_data["column_mapping"],
-                            dashboard_tabs=opts_data["dashboard_tabs"])
+                            dashboard_tabs=opts_data["dashboard_tabs"],
+                            sampling=Sampling(
+                                reference=SamplingOptions(**ref_sampling),
+                                current=SamplingOptions(**cur_sampling),
+                            ))
 
     runner = DashboardRunner(DashboardRunnerOptions(
         reference_data_path=reference,
         reference_data_options=DataOptions(date_column=opts.data_format.date_column,
                                            separator=opts.data_format.separator,
                                            header=opts.data_format.header),
-        production_data_path=current,
-        production_data_options=DataOptions(date_column=opts.data_format.date_column,
-                                            separator=opts.data_format.separator,
-                                            header=opts.data_format.header),
+        reference_data_sampling=opts.sampling.reference,
+        current_data_path=current,
+        current_data_options=DataOptions(date_column=opts.data_format.date_column,
+                                         separator=opts.data_format.separator,
+                                         header=opts.data_format.header),
+        current_data_sampling=opts.sampling.current,
         dashboard_tabs=opts.dashboard_tabs,
         column_mapping=opts.column_mapping,
         output_path=os.path.join(output_path, report_name),
@@ -60,21 +91,37 @@ def calculate_dashboard(config: str, reference: str, current: str, output_path:

 def calculate_profile(config: str, reference: str, current: str, output_path: str, report_name: str, **_kv):
     with open(config) as f_config:
-        opts_data = json.load(f_config)
+        if config.endswith(".yaml") or config.endswith(".yml"):
+            opts_data = yaml.load(f_config, Loader=yaml.SafeLoader)
+        elif config.endswith(".json"):
+            opts_data = json.load(f_config)
+        else:
+            raise Exception(f"config .{config.split('.')[-1]} not supported")
+
+    sampling = __get_not_none(opts_data, "sampling", {})
+    ref_sampling = __get_not_none(sampling, "reference", {})
+    cur_sampling = __get_not_none(sampling, "current", {})
 
     opts = ProfileOptions(data_format=DataFormatOptions(**opts_data["data_format"]),
                           column_mapping=opts_data["column_mapping"],
                           profile_parts=opts_data["profile_sections"],
-                          pretty_print=opts_data["pretty_print"])
+                          pretty_print=opts_data["pretty_print"],
+                          sampling=Sampling(
+                              reference=SamplingOptions(**ref_sampling),
+                              current=SamplingOptions(**cur_sampling),
+                          ))
 
     runner = ProfileRunner(ProfileRunnerOptions(
         reference_data_path=reference,
         reference_data_options=DataOptions(date_column=opts.data_format.date_column,
                                            separator=opts.data_format.separator,
                                            header=opts.data_format.header),
-        production_data_path=current,
-        production_data_options=DataOptions(date_column=opts.data_format.date_column,
-                                            separator=opts.data_format.separator,
-                                            header=opts.data_format.header),
+        reference_data_sampling=opts.sampling.reference,
+        current_data_path=current,
+        current_data_options=DataOptions(date_column=opts.data_format.date_column,
+                                         separator=opts.data_format.separator,
+                                         header=opts.data_format.header),
+        current_data_sampling=opts.sampling.current,
         profile_parts=opts.profile_parts,
         column_mapping=opts.column_mapping,
         output_path=os.path.join(output_path, report_name),
@@ -92,10 +139,13 @@ def _add_default_parameters(configurable_parser, default_output_name: str):
configurable_parser.add_argument("--reference", dest="reference", required=True, help="Path to reference data")
configurable_parser.add_argument("--current", dest="current", help="Path to current data")
configurable_parser.add_argument("--output_path", dest="output_path", required=True, help="Path to store report")
configurable_parser.add_argument("--report_name", dest="report_name", default=default_output_name, help="Report name")
configurable_parser.add_argument("--report_name", dest="report_name", default=default_output_name,
help="Report name")
configurable_parser.add_argument("--config", dest="config", required=True, help="Path to configuration")


logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser()

parsers = parser.add_subparsers()
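With this change the CLI accepts both JSON and YAML configs and enables INFO-level logging. A hedged usage sketch follows; the subcommand registrations sit in the part of the file elided above, so the subcommand and file names here are assumptions:

    python -m evidently calculate_profile --config config.yaml --reference reference.csv --current current.csv --output_path reports
    python -m evidently calculate_dashboard --config config.json --reference reference.csv --current current.csv --output_path reports --report_name drift_dashboard.html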
34 changes: 17 additions & 17 deletions evidently/analyzers/cat_target_drift_analyzer.py
@@ -10,7 +10,7 @@


 class CatTargetDriftAnalyzer(Analyzer):
-    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
+    def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
         result = dict()
         if column_mapping:
             date_column = column_mapping.get('datetime')
@@ -49,25 +49,25 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame,
             reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
             reference_data.dropna(axis=0, how='any', inplace=True)
 
-            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
-            production_data.dropna(axis=0, how='any', inplace=True)
+            current_data.replace([np.inf, -np.inf], np.nan, inplace=True)
+            current_data.dropna(axis=0, how='any', inplace=True)
 
             ref_feature_vc = reference_data[target_column].value_counts()
-            prod_feature_vc = production_data[target_column].value_counts()
+            current_feature_vc = current_data[target_column].value_counts()
 
             keys = set(list(reference_data[target_column].unique()) +
-                       list(production_data[target_column].unique()))
+                       list(current_data[target_column].unique()))
 
             ref_feature_dict = dict.fromkeys(keys, 0)
             for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
                 ref_feature_dict[key] = item
 
-            prod_feature_dict = dict.fromkeys(keys, 0)
-            for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
-                prod_feature_dict[key] = item
+            current_feature_dict = dict.fromkeys(keys, 0)
+            for key, item in zip(current_feature_vc.index, current_feature_vc.values):
+                current_feature_dict[key] = item
 
             f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(prod_feature_dict.items())]
+            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
 
             target_p_value = chisquare(f_exp, f_obs)[1]
             result['metrics']["target_name"] = target_column
@@ -80,25 +80,25 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame,
             reference_data.replace([np.inf, -np.inf], np.nan, inplace=True)
             reference_data.dropna(axis=0, how='any', inplace=True)
 
-            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
-            production_data.dropna(axis=0, how='any', inplace=True)
+            current_data.replace([np.inf, -np.inf], np.nan, inplace=True)
+            current_data.dropna(axis=0, how='any', inplace=True)
 
             ref_feature_vc = reference_data[prediction_column].value_counts()
-            prod_feature_vc = production_data[prediction_column].value_counts()
+            current_feature_vc = current_data[prediction_column].value_counts()
 
             keys = set(list(reference_data[prediction_column].unique()) +
-                       list(production_data[prediction_column].unique()))
+                       list(current_data[prediction_column].unique()))
 
             ref_feature_dict = dict.fromkeys(keys, 0)
             for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
                 ref_feature_dict[key] = item
 
-            prod_feature_dict = dict.fromkeys(keys, 0)
-            for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
-                prod_feature_dict[key] = item
+            current_feature_dict = dict.fromkeys(keys, 0)
+            for key, item in zip(current_feature_vc.index, current_feature_vc.values):
+                current_feature_dict[key] = item
 
             f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(prod_feature_dict.items())]
+            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
 
             pred_p_value = chisquare(f_exp, f_obs)[1]
             result['metrics']["prediction_name"] = prediction_column
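Both branches above build frequency tables aligned over the union of categories and feed them to scipy's chi-square test. A standalone, hedged sketch of the same check on toy labels; note that scipy.stats.chisquare takes observed counts first (chisquare(f_obs, f_exp)) and assumes both arrays sum to the same total, so this sketch rescales the expected counts:

    import pandas as pd
    from scipy.stats import chisquare

    reference = pd.Series(["a", "a", "b", "b", "b", "c"])
    current = pd.Series(["a", "b", "b", "c", "c", "c"])

    # Align both frequency tables over the union of categories (0 for missing ones).
    keys = sorted(set(reference) | set(current))
    ref_counts = reference.value_counts().reindex(keys, fill_value=0)
    cur_counts = current.value_counts().reindex(keys, fill_value=0)

    # Observed = current, expected = reference rescaled to the same total.
    f_obs = cur_counts.to_numpy()
    f_exp = ref_counts.to_numpy() * f_obs.sum() / ref_counts.sum()
    p_value = chisquare(f_obs, f_exp)[1]
    print(f"target drift p-value: {p_value:.3f}")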
24 changes: 12 additions & 12 deletions evidently/analyzers/classification_performance_analyzer.py
@@ -10,7 +10,7 @@
 from sklearn import metrics
 
 class ClassificationPerformanceAnalyzer(Analyzer):
-    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
+    def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
         result = dict()
         if column_mapping:
             date_column = column_mapping.get('datetime')
@@ -83,18 +83,18 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame,
         result['metrics']['reference']['confusion_matrix']['labels'] = labels
         result['metrics']['reference']['confusion_matrix']['values'] = conf_matrix.tolist()
 
-        if production_data is not None:
-            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
-            production_data.dropna(axis=0, how='any', inplace=True)
+        if current_data is not None:
+            current_data.replace([np.inf, -np.inf], np.nan, inplace=True)
+            current_data.dropna(axis=0, how='any', inplace=True)
 
             result['metrics']['current'] = {}
 
-            accuracy_score = metrics.accuracy_score(production_data[target_column], production_data[prediction_column])
-            avg_precision = metrics.precision_score(production_data[target_column], production_data[prediction_column],
+            accuracy_score = metrics.accuracy_score(current_data[target_column], current_data[prediction_column])
+            avg_precision = metrics.precision_score(current_data[target_column], current_data[prediction_column],
                                                     average='macro')
-            avg_recall = metrics.recall_score(production_data[target_column], production_data[prediction_column],
+            avg_recall = metrics.recall_score(current_data[target_column], current_data[prediction_column],
                                               average='macro')
-            avg_f1 = metrics.f1_score(production_data[target_column], production_data[prediction_column],
+            avg_f1 = metrics.f1_score(current_data[target_column], current_data[prediction_column],
                                       average='macro')
 
             result['metrics']['current']['accuracy'] = accuracy_score
@@ -103,15 +103,15 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame,
             result['metrics']['current']['f1'] = avg_f1
 
             #calculate class support and metrics matrix
-            metrics_matrix = metrics.classification_report(production_data[target_column], production_data[prediction_column],
+            metrics_matrix = metrics.classification_report(current_data[target_column], current_data[prediction_column],
                                                            output_dict=True)
 
             result['metrics']['current']['metrics_matrix'] = metrics_matrix
 
             #calculate confusion matrix
-            conf_matrix = metrics.confusion_matrix(production_data[target_column],
-                                                   production_data[prediction_column])
-            labels = target_names if target_names else sorted(set(production_data[target_column]))
+            conf_matrix = metrics.confusion_matrix(current_data[target_column],
+                                                   current_data[prediction_column])
+            labels = target_names if target_names else sorted(set(current_data[target_column]))
 
             result['metrics']['current']['confusion_matrix'] = {}
             result['metrics']['current']['confusion_matrix']['labels'] = labels
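The rename aside, the current-data branch recomputes the same sklearn metrics already produced for the reference data. A self-contained sketch of that metric block on toy labels (the data and variable names are illustrative):

    import pandas as pd
    from sklearn import metrics

    target = pd.Series(["cat", "dog", "dog", "cat", "dog"])
    prediction = pd.Series(["cat", "dog", "cat", "cat", "dog"])

    current_metrics = {
        "accuracy": metrics.accuracy_score(target, prediction),
        "precision": metrics.precision_score(target, prediction, average="macro"),
        "recall": metrics.recall_score(target, prediction, average="macro"),
        "f1": metrics.f1_score(target, prediction, average="macro"),
        # Per-class precision/recall/f1/support in one nested dict, as stored in the profile.
        "metrics_matrix": metrics.classification_report(target, prediction, output_dict=True),
        "confusion_matrix": metrics.confusion_matrix(target, prediction).tolist(),
    }
    print(current_metrics["accuracy"])  # 0.8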
20 changes: 10 additions & 10 deletions evidently/analyzers/data_drift_analyzer.py
@@ -10,7 +10,7 @@


 class DataDriftAnalyzer(Analyzer):
-    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping):
+    def calculate(self, reference_data: pd.DataFrame, current_data: pd.DataFrame, column_mapping):
         result = dict()
         if column_mapping:
             date_column = column_mapping.get('datetime')
@@ -47,37 +47,37 @@ def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame,
         result['metrics'] = {}
         for feature_name in num_feature_names:
             result['metrics'][feature_name] = dict(
-                prod_small_hist=[t.tolist() for t in np.histogram(production_data[feature_name][np.isfinite(production_data[feature_name])],
+                current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],
                                                                   bins=10, density=True)],
                 ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])],
                                                                  bins=10, density=True)],
                 feature_type='num',
-                p_value=ks_2samp(reference_data[feature_name], production_data[feature_name])[1]
+                p_value=ks_2samp(reference_data[feature_name], current_data[feature_name])[1]
             )
 
         for feature_name in cat_feature_names:
             ref_feature_vc = reference_data[feature_name][np.isfinite(reference_data[feature_name])].value_counts()
-            prod_feature_vc = production_data[feature_name][np.isfinite(production_data[feature_name])].value_counts()
+            current_feature_vc = current_data[feature_name][np.isfinite(current_data[feature_name])].value_counts()
 
             keys = set(list(reference_data[feature_name][np.isfinite(reference_data[feature_name])].unique()) +
-                       list(production_data[feature_name][np.isfinite(production_data[feature_name])].unique()))
+                       list(current_data[feature_name][np.isfinite(current_data[feature_name])].unique()))
 
             ref_feature_dict = dict.fromkeys(keys, 0)
             for key, item in zip(ref_feature_vc.index, ref_feature_vc.values):
                 ref_feature_dict[key] = item
 
-            prod_feature_dict = dict.fromkeys(keys, 0)
-            for key, item in zip(prod_feature_vc.index, prod_feature_vc.values):
-                prod_feature_dict[key] = item
+            current_feature_dict = dict.fromkeys(keys, 0)
+            for key, item in zip(current_feature_vc.index, current_feature_vc.values):
+                current_feature_dict[key] = item
 
             f_exp = [value[1] for value in sorted(ref_feature_dict.items())]
-            f_obs = [value[1] for value in sorted(prod_feature_dict.items())]
+            f_obs = [value[1] for value in sorted(current_feature_dict.items())]
 
             # CHI2 to be implemented for cases with different categories
             p_value = chisquare(f_exp, f_obs)[1]
 
             result['metrics'][feature_name] = dict(
-                prod_small_hist=[t.tolist() for t in np.histogram(production_data[feature_name][np.isfinite(production_data[feature_name])],
+                current_small_hist=[t.tolist() for t in np.histogram(current_data[feature_name][np.isfinite(current_data[feature_name])],
                                                                   bins=10, density=True)],
                 ref_small_hist=[t.tolist() for t in np.histogram(reference_data[feature_name][np.isfinite(reference_data[feature_name])],
                                                                  bins=10, density=True)],
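For numerical features the analyzer pairs a two-sample Kolmogorov-Smirnov test with 10-bin density histograms for the report plots. A minimal sketch of both pieces on synthetic data:

    import numpy as np
    from scipy.stats import ks_2samp

    rng = np.random.default_rng(0)
    reference = rng.normal(loc=0.0, size=500)
    current = rng.normal(loc=0.5, size=500)  # shifted mean, so drift is expected

    # Two-sample KS test: a small p-value suggests the two samples differ.
    p_value = ks_2samp(reference, current)[1]

    # 10-bin density histogram over the finite values, as stored for the report.
    ref_small_hist = [t.tolist() for t in np.histogram(reference[np.isfinite(reference)], bins=10, density=True)]

    print(f"p-value: {p_value:.2e}, drift detected: {p_value < 0.05}")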
