Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Categories filtering for nested properties #59

Merged
merged 19 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

.DS_Store
.vscode/
110 changes: 110 additions & 0 deletions jupyter_telemetry/_eventschema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from collections import deque

from jsonschema import Draft7Validator, validators
from jsonschema.exceptions import ValidationError


class ExtractCategories(ValidationError):
    """
    A pseudo validation "error" used to carry a property's declared
    ``categories`` out of a jsonschema validation pass.

    It subclasses ``ValidationError`` only so that the extended validator
    (see ``extend_with_categories``) can yield it through jsonschema's
    normal error channel; it does not represent a validation failure.
    """
    def __init__(self, property, categories, *args, **kwargs):
        # Deliberately skip ValidationError in the MRO and initialize via its
        # parent, so this object is constructed without the full ValidationError
        # machinery. NOTE(review): relies on jsonschema's internal class
        # hierarchy accepting these kwargs — confirm when bumping jsonschema.
        super(ValidationError, self).__init__(*args, **kwargs)
        # Name of the schema property the categories were declared on.
        self.property = property
        # The list of category labels from the property's subschema.
        self.categories = categories


def extend_with_categories(validator_class):
    """
    Extend the validator class so that during json schema validation, whenever
    the keyword 'categories' is encountered in a valid context with regards to a
    property, it yields an instance of ExtractCategories containing the
    information needed for category filtering later.
    """
    base_validate_properties = validator_class.VALIDATORS["properties"]

    def properties_with_categories(validator, properties, instance, schema):
        # First surface every property that declares categories.
        for prop_name, prop_schema in properties.items():
            if "categories" in prop_schema:
                yield ExtractCategories(
                    prop_name, prop_schema["categories"], message=None
                )
        # Then defer to the stock "properties" validation behavior.
        yield from base_validate_properties(
            validator, properties, instance, schema,
        )

    return validators.extend(
        validator_class, {"properties": properties_with_categories},
    )


# Draft 7 is the jsonschema dialect used for all event schemas.
JSONSchemaValidator = Draft7Validator
# Validator whose "properties" keyword additionally yields ExtractCategories
# pseudo-errors (see extend_with_categories above).
CategoryExtractor = extend_with_categories(JSONSchemaValidator)


# Ignore categories under any of these jsonschema keywords
# (conditional/combinator contexts where a property match is not definitive).
IGNORE_CATEGORIES_SCHEMA_KEYWORDS = {
    'if', 'not', 'anyOf', 'oneOf', 'then', 'else'
}


def extract_categories_from_errors(errors):
    """
    Recursively walk a tree of jsonschema errors, yielding every
    ExtractCategories instance that was not produced inside a
    conditional/combinator keyword (IGNORE_CATEGORIES_SCHEMA_KEYWORDS).

    Parameters
    ----------
    errors : iterable of jsonschema.exceptions.ValidationError
        Typically the output of ``CategoryExtractor(schema).iter_errors(...)``.

    Yields
    ------
    ExtractCategories
    """
    for e in errors:
        if (
            isinstance(e, ExtractCategories) and
            not any(p in IGNORE_CATEGORIES_SCHEMA_KEYWORDS
                    for p in e.absolute_schema_path)
        ):
            yield e
        else:
            # A leaf ValidationError has context == None; recursing into it
            # unguarded would raise TypeError, so treat None as "no children".
            yield from extract_categories_from_errors(e.context or [])


def extract_categories(instance, schema):
    """
    Generate dict of ExtractCategories whose keys are pointers to the properties
    """
    found = extract_categories_from_errors(
        CategoryExtractor(schema).iter_errors(instance)
    )
    # Key each result by the JSON pointer (as a tuple) to the property itself.
    pointers = {}
    for cat in found:
        pointers[tuple(cat.absolute_path) + (cat.property,)] = cat
    return pointers


def filter_categories(instance, categories, allowed_categories, allowed_properties):
    """
    Redact (set to None) every value in *instance* whose declared categories
    are not all allowed, mutating and returning *instance*.

    Parameters
    ----------
    instance : dict
        The event data to filter, modified in place.
    categories : dict
        Mapping of property pointers (tuples) to ExtractCategories, as
        produced by ``extract_categories``.
    allowed_categories : set
        Category labels the sink is permitted to record.
    allowed_properties : set
        Top-level property names that are always recorded regardless of
        their categories.

    Returns
    -------
    dict
        The same *instance* object, with disallowed values nulled out.
    """
    # Top-level properties without declared categories are set to null
    for property in instance.keys():
        path = (property,)
        if path not in categories:
            instance[property] = None

    # Allow only properties whose categories are included in allowed_categories
    # and whose top-level parent is included in allowed_properties
    not_allowed = (
        c for p, c in categories.items()
        if not (set(c.categories).issubset(allowed_categories) or
                p[0] in allowed_properties)
    )

    for c in not_allowed:
        # In case both a sub property and its parent, e.g. ['user', 'name'] and
        # ['user'], do not have all the allowed categories and are to be removed,
        # if the parent is removed first then attempting to access
        # the descendent would either return None or raise an IndexError or
        # KeyError. Just skip it.
        try:
            item = deep_get(instance, c.absolute_path)
        except (IndexError, KeyError):
            continue

        if item is not None:
            item[c.property] = None

    return instance


def deep_get(instance, path):
    """
    Follow *path* (an iterable of keys/indices) into the nested container
    *instance* and return the value found there.

    Stops early and returns None if an intermediate value is None.
    May raise KeyError/IndexError if a step in the path does not exist.

    Fix: the original implementation consumed *path* with ``popleft()``,
    destructively emptying the caller's deque (e.g. a category's
    ``absolute_path``) as a side effect. Iterating leaves *path* intact and
    also accepts any iterable, not just a deque.
    """
    result = instance
    for key in path:
        if result is None:
            break
        result = result[key]
    return result
31 changes: 13 additions & 18 deletions jupyter_telemetry/eventlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging
from datetime import datetime

import jsonschema
from pythonjsonlogger import jsonlogger
try:
from ruamel.yaml import YAML
Expand All @@ -29,6 +28,12 @@
from .traits import Handlers, SchemaOptions
from . import TELEMETRY_METADATA_VERSION

from ._eventschema import (
JSONSchemaValidator,
extract_categories,
filter_categories
)

yaml = YAML(typ='safe')


Expand Down Expand Up @@ -121,7 +126,7 @@ def register_schema(self, schema):
"""
# Check if our schema itself is valid
# This throws an exception if it isn't valid
jsonschema.validators.validator_for(schema).check_schema(schema)
JSONSchemaValidator.check_schema(schema)

# Check that the properties we require are present
required_schema_fields = {'$id', 'version', 'properties'}
Expand Down Expand Up @@ -192,7 +197,7 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
schema = self.schemas[(schema_name, version)]

# Validate the event data.
jsonschema.validate(event, schema)
JSONSchemaValidator(schema).validate(event)

# Generate the empty event capsule.
if timestamp_override is None:
Expand All @@ -211,20 +216,10 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
allowed_categories = self.get_allowed_categories(schema_name)
allowed_properties = self.get_allowed_properties(schema_name)

# Iterate through the event properties, and only record the
# properties labelled with allowed_categories
for property_name, data in event.items():
prop_categories = schema["properties"][property_name]["categories"]
# If the property is explicitly listed in
# the allowed_properties, then include it in the capsule
if property_name in allowed_properties:
capsule[property_name] = data
# All of the property categories must be listed in the the allowed
# categories for this property to be recorded.
elif any([cat in allowed_categories for cat in prop_categories]):
capsule[property_name] = data
# Else return that property with a value of null
else:
capsule[property_name] = None
categories = extract_categories(event, schema)
filtered_event = filter_categories(
event, categories, allowed_categories, allowed_properties
)
capsule.update(filtered_event)

self.log.info(capsule)
Empty file added tests/__init__.py
Empty file.
37 changes: 11 additions & 26 deletions tests/test_allowed_schemas.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import io
import json
import logging
from textwrap import dedent as _
from ruamel.yaml import YAML

from jupyter_telemetry.eventlog import EventLog

import pytest

from .utils import get_event_data


SCHEMA_ID = "test.event"
VERSION = 1


@pytest.fixture
def schema():
return {
return {
'$id': SCHEMA_ID,
'title': 'Test Event',
'version': VERSION,
Expand Down Expand Up @@ -103,13 +103,13 @@ def test_missing_categories_label():
assert 'All properties must have a "categories"' in str(err.value)



EVENT_DATA = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': '[email protected]',
}


@pytest.mark.parametrize(
'allowed_schemas,expected_output',
[
Expand Down Expand Up @@ -198,28 +198,13 @@ def test_missing_categories_label():
]
)
def test_allowed_schemas(schema, allowed_schemas, expected_output):
sink = io.StringIO()

# Create a handler that captures+records events with allowed tags.
handler = logging.StreamHandler(sink)

e = EventLog(
handlers=[handler],
allowed_schemas=allowed_schemas
event_data = get_event_data(
EVENT_DATA,
schema,
SCHEMA_ID,
VERSION,
allowed_schemas
)
e.register_schema(schema)

event = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': '[email protected]',
}

# Record event and read output
e.record_event(SCHEMA_ID, VERSION, EVENT_DATA)
recorded_event = json.loads(sink.getvalue())
event_data = {key: value for key, value in recorded_event.items() if not key.startswith('__')}

# Verify that *exactly* the right properties are recorded.
assert expected_output == event_data

Loading