Skip to content

Commit

Permalink
Fix writing segmented performance metrics to WhyLabs (#1386)
Browse files Browse the repository at this point in the history
## Description

Fixes #1385 by forcing v0 format for profiles that contain performance
metrics. We missed this check when writing segmented results as v1, but
we still need v0 for performance metrics message.

## Changes

- Switch back to v0 for performance metrics, added some checks and
logging
- Test updates to check that v0 preserves new metadata such as trace_id

- [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md)
and the [Code of Conduct](CODE_OF_CONDUCT.md).
  • Loading branch information
jamie256 authored Sep 27, 2023
1 parent c833439 commit 9ce3d41
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 5 deletions.
8 changes: 7 additions & 1 deletion python/tests/api/logger/test_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def test_segment_write_roundtrip_versions(tmp_path: Any, v0) -> None:
input_rows = 10
segment_column = "col3"
number_of_segments = 2
trace_id = "123-456"
values_per_segment = input_rows / number_of_segments
d = {
"col1": [i for i in range(input_rows)],
Expand All @@ -220,7 +221,7 @@ def test_segment_write_roundtrip_versions(tmp_path: Any, v0) -> None:
df = pd.DataFrame(data=d)
test_segments = segment_on_column(segment_column)

results: SegmentedResultSet = why.log(df, schema=DatasetSchema(segments=test_segments))
results: SegmentedResultSet = why.log(df, trace_id=trace_id, schema=DatasetSchema(segments=test_segments))
assert results.count == number_of_segments
partitions = results.partitions
assert len(partitions) == 1
Expand Down Expand Up @@ -255,6 +256,11 @@ def test_segment_write_roundtrip_versions(tmp_path: Any, v0) -> None:
post_deserialization_first_view = roundtrip_profiles[0]
assert post_deserialization_first_view is not None
assert isinstance(post_deserialization_first_view, DatasetProfileView)

# check that trace_id is preserved round trip in metadata
assert post_deserialization_first_view.metadata
assert "whylabs.traceId" in post_deserialization_first_view.metadata
assert trace_id == post_deserialization_first_view.metadata["whylabs.traceId"]
pre_serialization_first_view = first_segment_profile.view()
pre_columns = pre_serialization_first_view.get_columns()
post_columns = post_deserialization_first_view.get_columns()
Expand Down
13 changes: 11 additions & 2 deletions python/whylogs/api/writer/whylabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,10 @@ def _write_segmented_reference_result_set(self, file: SegmentedResultSet, **kwar
upload_statuses = list()
for view, url in zip(files, upload_urls):
with tempfile.NamedTemporaryFile() as tmp_file:
view.write(file=tmp_file)
if kwargs.get("use_v0") is None or kwargs.get("use_v0"):
view.write(file=tmp_file, use_v0=True)
else:
view.write(file=tmp_file)
tmp_file.flush()
tmp_file.seek(0)

Expand Down Expand Up @@ -583,7 +586,13 @@ def write(self, file: Writable, **kwargs: Any) -> Tuple[bool, str]:
self._dataset_id = kwargs.get("dataset_id")

with tempfile.NamedTemporaryFile() as tmp_file:
view.write(file=tmp_file)
# currently whylabs is not ingesting the v1 format of segmented profiles as segmented
# so we default to sending them as v0 profiles if the override `use_v0` is not defined,
# if `use_v0` is defined then pass that through to control the serialization format.
if has_segments and (kwargs.get("use_v0") is None or kwargs.get("use_v0")):
view.write(file=tmp_file, use_v0=True)
else:
view.write(file=tmp_file)
tmp_file.flush()
tmp_file.seek(0)
utc_now = datetime.datetime.now(datetime.timezone.utc)
Expand Down
7 changes: 5 additions & 2 deletions python/whylogs/core/view/segmented_dataset_profile_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,11 @@ def _write_v1(self, path: Optional[str] = None, **kwargs: Any) -> Tuple[bool, st
return True, path

def write(self, path: Optional[str] = None, **kwargs: Any) -> Tuple[bool, str]:
if kwargs.get("use_v0"):
logger.warning("writing segmented profile as v0 format, some info may be converted")
if kwargs.get("use_v0") or self.profile_view.model_performance_metrics:
if self.profile_view.model_performance_metrics:
logger.info("Converting segmented profile with performance metrics to v0 format before writing.")
else:
logger.info("writing segmented profile as v0 format.")
return self._write_as_v0_message(path, **kwargs)
else:
return self._write_v1(path, **kwargs)

0 comments on commit 9ce3d41

Please sign in to comment.