Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Categories filtering for nested properties #59

Merged
merged 19 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

.DS_Store
.vscode/
184 changes: 184 additions & 0 deletions jupyter_telemetry/_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
from collections import deque

from jsonschema import Draft7Validator, validators
from jsonschema.exceptions import ValidationError


class ExtractCategories(ValidationError):
    """
    A special `jsonschema.ValidationError` that carries information about the
    `categories` keyword, intended to be yielded whenever a `categories` keyword
    is encountered during `jsonschema` JSON validation.

    The primary use case for this class is to make use of the JSON validation
    mechanism implemented by `jsonschema` to extract all categories associated
    with each property in a JSON instance based on a JSON schema. It is not
    intended to be used as an actual validation error.

    Parameters
    ----------
    property : str
        Name of the schema property the categories are declared on.

    categories : list
        Value of the `categories` keyword for that property.

    *args, **kwargs
        Forwarded unchanged to the `ValidationError` initializer.
    """

    def __init__(self, property, categories, *args, **kwargs):
        # Zero-argument super() walks the MRO starting right after this class.
        # The previous `super(ValidationError, self)` started *after*
        # ValidationError and would silently bypass any initializer defined
        # on it.
        super().__init__(*args, **kwargs)
        self.property = property
        self.categories = categories


def extend_with_categories(validator_class):
    """
    Extend a `jsonschema.IValidator` class so that it yields an `ExtractCategories`
    whenever a `categories` keyword is encountered during JSON validation

    Parameters
    ----------
    validator_class : jsonschema.IValidator
        an existing validator class

    Returns
    -------
    jsonschema.IValidator
        a new `jsonschema.IValidator` class extending the one provided

    Examples
    --------
    from jsonschema import Draft7Validator


    CategoryExtractor = extend_with_categories(Draft7Validator)
    """
    # Keep a handle on the stock `properties` implementation so that regular
    # validation (and descent into nested schemas) still takes place.
    validate_properties = validator_class.VALIDATORS["properties"]

    def get_categories(validator, properties, instance, schema):
        for property, subschema in properties.items():
            # A subschema may be a plain boolean (`true` / `false`); a
            # membership test on a bool raises TypeError, and such a
            # subschema cannot declare categories anyway.
            if isinstance(subschema, bool):
                continue
            if "categories" in subschema:
                yield ExtractCategories(
                    property, subschema["categories"], message=None
                )

        # Delegate to the original keyword so real errors and nested
        # category markers are produced as well.
        yield from validate_properties(validator, properties, instance, schema)

    return validators.extend(
        validator_class, {"properties": get_categories},
    )


# Alias for the jsonschema validator class (Draft 7) this module builds on.
JSONSchemaValidator = Draft7Validator
# Same validator, extended so that its `properties` keyword also yields an
# `ExtractCategories` for every property that declares `categories`.
CategoryExtractor = extend_with_categories(JSONSchemaValidator)


# Ignore categories under any of these jsonschema keywords
IGNORE_CATEGORIES_SCHEMA_KEYWORDS = {
    'if', 'not', 'anyOf', 'oneOf', 'then', 'else'
}


def extract_categories_from_errors(errors):
    """
    Recursively yield the `ExtractCategories` found in an iterable of errors.

    An `ExtractCategories` is yielded only when none of the elements of its
    absolute schema path is listed in `IGNORE_CATEGORIES_SCHEMA_KEYWORDS`.
    For every other error (including an ignored `ExtractCategories`), the
    search continues in that error's `context`.

    Parameters
    ----------
    errors : iterable
        Errors as produced by a validator's `iter_errors`, or the `context`
        of such an error.

    Yields
    ------
    ExtractCategories
    """
    for error in errors:
        if isinstance(error, ExtractCategories):
            schema_path = error.absolute_schema_path
            if IGNORE_CATEGORIES_SCHEMA_KEYWORDS.isdisjoint(schema_path):
                yield error
                continue

        # Not a usable category marker: look at the errors nested inside it.
        yield from extract_categories_from_errors(error.context)


def extract_categories_from_event(event, schema):
    """
    Build a `dict` of `ExtractCategories` keyed by pointers to properties

    Parameters
    ----------
    event : dict
        A telemetry event

    schema : dict
        A JSON schema

    Returns
    -------
    dict
        One entry per `ExtractCategories` collected while running
        `CategoryExtractor(schema)` over the event.

        The key is a tuple made of the marker's absolute path followed by
        the property name; the value is the `ExtractCategories` itself,
        which holds the categories declared for that property.
    """
    extractor = CategoryExtractor(schema)
    markers = extract_categories_from_errors(extractor.iter_errors(event))

    by_pointer = {}
    for marker in markers:
        pointer = (*marker.absolute_path, marker.property)
        by_pointer[pointer] = marker
    return by_pointer


def filter_categories_from_event(event, schema, allowed_categories, allowed_properties):
    """
    Filter properties from an event based on their categories.

    Only whitelisted properties and properties whose categories are allowed are kept;
    every other property has its value replaced by `None`.

    The input event is left untouched: filtering is applied to a deep copy.

    Parameters
    ----------
    event : dict
        The input telemetry event

    schema : dict
        A JSON schema that makes use of the `categories` keyword to
        specify what categories are associated with a certain property.

    allowed_categories : set
        Specify which categories are allowed

    allowed_properties : set
        Whitelist certain top level properties.

        These properties are included in the output event even if not all of
        their properties are allowed.

    Returns
    -------
    dict
        A filtered copy of the event

    """
    # Imported locally so this function does not depend on the module's
    # import block.
    from copy import deepcopy

    # Never write into the caller's event: callers may reuse it afterwards.
    event = deepcopy(event)
    categories = extract_categories_from_event(event, schema)

    # Top-level properties without declared categories are set to null
    for name in event:
        if (name,) not in categories:
            event[name] = None

    # A property is kept when all of its categories are included in
    # allowed_categories OR its top-level parent is in allowed_properties;
    # everything else is collected here to be nulled.
    not_allowed = (
        c for p, c in categories.items()
        if not (set(c.categories).issubset(allowed_categories) or
                p[0] in allowed_properties)
    )

    for c in not_allowed:
        # In case both a sub property and its parent, e.g. ['user', 'name'] and
        # ['user'], do not have all the allowed categories and are to be removed,
        # if the parent is removed first then attempting to access
        # the descendent would either return None or raise an IndexError or
        # KeyError. Just skip it.
        try:
            # Hand deep_get its own deque so the path stored on the error
            # is never consumed.
            item = deep_get(event, deque(c.absolute_path))
        except (IndexError, KeyError):
            continue

        # Categories are reported for every property declared in the schema,
        # including ones the event does not contain. Only null out properties
        # that actually exist, so no new keys are added to the output.
        if isinstance(item, dict) and c.property in item:
            item[c.property] = None

    return event


def deep_get(instance, path):
    """
    Follow a sequence of keys / indices into a nested structure.

    Parameters
    ----------
    instance : object
        The nested container (dicts and/or lists) to look into.

    path : iterable
        Keys or indices to apply in order. It is only read, never modified,
        so any iterable (deque, list, tuple) may be passed.

    Returns
    -------
    object
        The value reached at the end of the path. `None` is returned as soon
        as a `None` value is met along the way; `instance` itself is returned
        for an empty path.

    Raises
    ------
    KeyError, IndexError
        If a key or index along the path does not exist.
    """
    result = instance
    for key in path:
        # Stop descending once there is nothing left to index into.
        if result is None:
            break
        result = result[key]
    return result
1 change: 1 addition & 0 deletions jupyter_telemetry/categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._categories import JSONSchemaValidator, filter_categories_from_event # noqa
26 changes: 8 additions & 18 deletions jupyter_telemetry/eventlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging
from datetime import datetime

import jsonschema
from pythonjsonlogger import jsonlogger
try:
from ruamel.yaml import YAML
Expand All @@ -29,6 +28,8 @@
from .traits import Handlers, SchemaOptions
from . import TELEMETRY_METADATA_VERSION

from .categories import JSONSchemaValidator, filter_categories_from_event

yaml = YAML(typ='safe')


Expand Down Expand Up @@ -121,7 +122,7 @@ def register_schema(self, schema):
"""
# Check if our schema itself is valid
# This throws an exception if it isn't valid
jsonschema.validators.validator_for(schema).check_schema(schema)
JSONSchemaValidator.check_schema(schema)

# Check that the properties we require are present
required_schema_fields = {'$id', 'version', 'properties'}
Expand Down Expand Up @@ -192,7 +193,7 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
schema = self.schemas[(schema_name, version)]

# Validate the event data.
jsonschema.validate(event, schema)
JSONSchemaValidator(schema).validate(event)

# Generate the empty event capsule.
if timestamp_override is None:
Expand All @@ -211,20 +212,9 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
allowed_categories = self.get_allowed_categories(schema_name)
allowed_properties = self.get_allowed_properties(schema_name)

# Iterate through the event properties, and only record the
# properties labelled with allowed_categories
for property_name, data in event.items():
prop_categories = schema["properties"][property_name]["categories"]
# If the property is explicitly listed in
# the allowed_properties, then include it in the capsule
if property_name in allowed_properties:
capsule[property_name] = data
# All of the property categories must be listed in the allowed
# categories for this property to be recorded.
elif any([cat in allowed_categories for cat in prop_categories]):
capsule[property_name] = data
# Else return that property with a value of null
else:
capsule[property_name] = None
filtered_event = filter_categories_from_event(
event, schema, allowed_categories, allowed_properties
)
capsule.update(filtered_event)

self.log.info(capsule)
Empty file added tests/__init__.py
Empty file.
37 changes: 11 additions & 26 deletions tests/test_allowed_schemas.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import io
import json
import logging
from textwrap import dedent as _
from ruamel.yaml import YAML

from jupyter_telemetry.eventlog import EventLog

import pytest

from .utils import get_event_data


SCHEMA_ID = "test.event"
VERSION = 1


@pytest.fixture
def schema():
return {
return {
'$id': SCHEMA_ID,
'title': 'Test Event',
'version': VERSION,
Expand Down Expand Up @@ -103,13 +103,13 @@ def test_missing_categories_label():
assert 'All properties must have a "categories"' in str(err.value)



EVENT_DATA = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': '[email protected]',
}


@pytest.mark.parametrize(
'allowed_schemas,expected_output',
[
Expand Down Expand Up @@ -198,28 +198,13 @@ def test_missing_categories_label():
]
)
def test_allowed_schemas(schema, allowed_schemas, expected_output):
sink = io.StringIO()

# Create a handler that captures+records events with allowed tags.
handler = logging.StreamHandler(sink)

e = EventLog(
handlers=[handler],
allowed_schemas=allowed_schemas
event_data = get_event_data(
EVENT_DATA,
schema,
SCHEMA_ID,
VERSION,
allowed_schemas
)
e.register_schema(schema)

event = {
'nothing-exciting': 'hello, world',
'id': 'test id',
'email': '[email protected]',
}

# Record event and read output
e.record_event(SCHEMA_ID, VERSION, EVENT_DATA)
recorded_event = json.loads(sink.getvalue())
event_data = {key: value for key, value in recorded_event.items() if not key.startswith('__')}

# Verify that *exactly* the right properties are recorded.
assert expected_output == event_data

Loading