From 1090f7d6c455ebe9bb36e200e3679f2270e7d276 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Wed, 13 Jul 2022 15:06:53 -0500 Subject: [PATCH 01/14] Add an EventSchema and SchemaRegistry API --- jupyter_events/_categories.py | 185 ---------------- jupyter_events/categories.py | 2 - jupyter_events/logger.py | 207 +++++------------- jupyter_events/schema.py | 178 +++++++++++++++ jupyter_events/schema_registry.py | 62 ++++++ jupyter_events/schemas/event-metaschema.yml | 31 +++ .../schemas/property-metaschema.yml | 20 ++ jupyter_events/yaml.py | 21 ++ tests/test_category_filtering.py | 2 +- tests/test_eventlog.py | 52 ----- tests/test_register_schema.py | 70 +++--- tests/utils.py | 6 +- 12 files changed, 409 insertions(+), 427 deletions(-) delete mode 100644 jupyter_events/_categories.py delete mode 100644 jupyter_events/categories.py create mode 100644 jupyter_events/schema.py create mode 100644 jupyter_events/schema_registry.py create mode 100644 jupyter_events/schemas/event-metaschema.yml create mode 100644 jupyter_events/schemas/property-metaschema.yml create mode 100644 jupyter_events/yaml.py delete mode 100644 tests/test_eventlog.py diff --git a/jupyter_events/_categories.py b/jupyter_events/_categories.py deleted file mode 100644 index e7e8045..0000000 --- a/jupyter_events/_categories.py +++ /dev/null @@ -1,185 +0,0 @@ -from collections import deque - -from jsonschema import Draft7Validator, validators -from jsonschema.exceptions import ValidationError - - -class ExtractCategories(ValidationError): - """ - A special `jsonschema.ValidationError` that carries information about the - `categories` keyword, intended to be yielded whenever a `categories` keyword - is encountered during `jsonschema` JSON validation. - - The primary use case for this class is to make use of the JSON validation - mechanism implemented by `jsonschema` to extract all categories associated - with each property in a JSON instance based on a JSON schema. 
It is not - intended to be used as an actual validation error. - """ - - def __init__(self, property, categories, *args, **kwargs): - super(ValidationError, self).__init__(*args, **kwargs) - self.property = property - self.categories = categories - - -def extend_with_categories(validator_class): - """ - Extend a `jsonschema.IValidator` class so that it yields a `_ExtractCategories` - whenever a `categories` keyword is encountered during JSON validation - - Parameters - ---------- - validator_class : jsonschema.IValidator - an existing validator class - - Returns - ------- - jsonschema.IValidator - a new `jsonschema.IValidator` class extending the one provided - - Examples - -------- - from jsonschema import Draft7Validator - - - CategoryExtractor = extend_with_categories(Draft7Validator) - """ - validate_properties = validator_class.VALIDATORS["properties"] - - def get_categories(validator, properties, instance, schema): - for property, subschema in properties.items(): - if "categories" in subschema: - yield ExtractCategories(property, subschema["categories"], message=None) - - yield from validate_properties( - validator, - properties, - instance, - schema, - ) - - return validators.extend( - validator_class, - {"properties": get_categories}, - ) - - -JSONSchemaValidator = Draft7Validator -CategoryExtractor = extend_with_categories(JSONSchemaValidator) - - -# Ignore categories under any of these jsonschema keywords -IGNORE_CATEGORIES_SCHEMA_KEYWORDS = {"if", "not", "anyOf", "oneOf", "then", "else"} - - -def extract_categories_from_errors(errors): - for e in errors: - if isinstance(e, ExtractCategories) and not any( - p in IGNORE_CATEGORIES_SCHEMA_KEYWORDS for p in e.absolute_schema_path - ): - yield e - else: - yield from extract_categories_from_errors(e.context) - - -def extract_categories_from_event(event, schema): - """ - Generate a `dict` of `_ExtractCategories` whose keys are pointers to the properties - - Parameters - ---------- - event : dict - Event data - 
- schema : dict - A JSON schema - - Returns - ------- - dict - A mapping from properties in the event to their categories. - - In each entry, the key is a pointer to a property in the event - (in the form of a tuple) and the value is a `_ExtractCategories` - containing the categories associated with that property. - """ - return { - tuple(c.absolute_path + deque([c.property])): c - for c in extract_categories_from_errors( - CategoryExtractor(schema).iter_errors(event) - ) - } - - -def filter_categories_from_event(event, schema, allowed_categories, allowed_properties): - """ - Filter properties from an event based on their categories. - - Only whitelisted properties and properties whose categories are allowed are kept. - - Parameters - ---------- - event : dict - The input event - - schema : dict - A JSON schema that makes use of the the `categories` keyword to - specify what categories are associated with a certain property. - - allowed_categories : set - Specify which categories are allowed - - allowed_properties : set - Whitelist certain top level properties. - - These properties are included in the output event even if not all of - their properties are allowed. - - Returns - ------- - dict - The output event after category filtering - - """ - categories = extract_categories_from_event(event, schema) - - # Top-level properties without declared categories are set to null - for property in event.keys(): - path = (property,) - if path not in categories: - event[property] = None - - # Allow only properties whose categories are included in allowed_categories - # and whose top-level parent is included in allowed_properties - not_allowed = ( - c - for p, c in categories.items() - if not ( - set(c.categories).issubset(allowed_categories) or p[0] in allowed_properties - ) - ) - - for c in not_allowed: - # In case both a sub property and its parent, e.g. 
['user', 'name'] and - # ['user'], do not have all the allowed categories and are to be removed, - # if the parent is removed first then attempting to access - # the descendent would either return None or raise an IndexError or - # KeyError. Just skip it. - try: - item = deep_get(event, c.absolute_path) - except IndexError: - continue - except KeyError: - continue - - if item is not None: - item[c.property] = None - - return event - - -def deep_get(instance, path): - result = instance - while result is not None and path: - result = result[path.popleft()] - return result diff --git a/jupyter_events/categories.py b/jupyter_events/categories.py deleted file mode 100644 index 8fff253..0000000 --- a/jupyter_events/categories.py +++ /dev/null @@ -1,2 +0,0 @@ -# flake8: noqa -from ._categories import JSONSchemaValidator, filter_categories_from_event diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 61fbd03..34e7764 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -6,43 +6,12 @@ from datetime import datetime from pythonjsonlogger import jsonlogger - -try: - from ruamel.yaml import YAML -except ImportError as e: - # check for known conda bug that prevents - # pip from installing ruamel.yaml dependency - try: - import ruamel_yaml # noqa - except ImportError: - # nope, regular import error; raise original - raise e - else: - # have conda fork ruamel_yaml, but not ruamel.yaml. - # this is a bug in the ruamel_yaml conda package - # mistakenly identifying itself as ruamel.yaml to pip. - # conda install the 'real' ruamel.yaml to fix - raise ImportError( - "Missing dependency ruamel.yaml. Try: `conda install ruamel.yaml`" - ) - +from traitlets import Instance, List from traitlets.config import Config, Configurable from . 
import EVENTS_METADATA_VERSION -from .categories import JSONSchemaValidator, filter_categories_from_event -from .traits import Handlers, SchemaOptions - -yaml = YAML(typ="safe") - - -def _skip_message(record, **kwargs): - """ - Remove 'message' from log record. - It is always emitted with 'null', and we do not want it, - since we are always emitting events only - """ - del record["message"] - return json.dumps(record, **kwargs) +from .schema_registry import SchemaRegistry +from .traits import Handlers class EventLogger(Configurable): @@ -51,7 +20,7 @@ class EventLogger(Configurable): """ handlers = Handlers( - [], + default_value=[], allow_none=True, help="""A list of logging.Handler instances to send events to. @@ -59,21 +28,23 @@ class EventLogger(Configurable): """, ).tag(config=True) - allowed_schemas = SchemaOptions( - {}, - allow_none=True, - help=""" - Fully qualified names of schemas to record. + allowed_policies = List( + default_value=["all"], + help=( + """ + A list of the redaction policies that will not be redacted + from incoming, recorded events. + """ + ), + ) - Each schema you want to record must be manually specified. - The default, an empty list, means no events are recorded. - """, - ).tag(config=True) + schema_registry = Instance(SchemaRegistry) def __init__(self, *args, **kwargs): # We need to initialize the configurable before # adding the logging handlers. super().__init__(*args, **kwargs) + self.schema_registry = SchemaRegistry(allowed_policies=self.allowed_policies) # Use a unique name for the logger so that multiple instances of EventLog do not write # to each other's handlers. log_name = __name__ + "." + str(id(self)) @@ -82,13 +53,10 @@ def __init__(self, *args, **kwargs): self.log.propagate = False # We will use log.info to emit self.log.setLevel(logging.INFO) - self.schemas = {} # Add each handler to the logger and format the handlers. 
if self.handlers: - formatter = jsonlogger.JsonFormatter(json_serializer=_skip_message) for handler in self.handlers: - handler.setFormatter(formatter) - self.log.addHandler(handler) + self.add_handler(handler) def _load_config(self, cfg, section_names=None, traits=None): """Load EventLogger traits from a Config object, patching the @@ -107,94 +75,42 @@ def get_handlers(): eventlogger_cfg = Config({"EventLogger": my_cfg}) super()._load_config(eventlogger_cfg, section_names=None, traits=None) - def register_schema_file(self, filename): - """ - Convenience function for registering a JSON schema from a filepath - - Supports both JSON & YAML files. + def register_schema(self, schema: dict): + """Register events schema with the SchemaRegistry.""" + self.schema_registry.register(schema) - Parameters - ---------- - filename: str, path object or file-like object - Path to the schema file or a file object to register. - """ - # Just use YAML loader for everything, since all valid JSON is valid YAML + def register_schema_file(self, schema_filepath): + """Register events schema with the SchemaRegistry.""" + self.schema_registry.register_from_file(schema_filepath) - # check if input is a file-like object - if hasattr(filename, "read") and hasattr(filename, "write"): - self.register_schema(yaml.load(filename)) - else: - with open(filename) as f: - self.register_schema(yaml.load(f)) + def add_handler(self, handler: logging.Handler): + """Add a new logging handler to the Event Logger. - def register_schema(self, schema): + All outgoing messages will be formatted as a JSON string. """ - Register a given JSON Schema with this event emitter - 'version' and '$id' are required fields. 
- """ - # Check if our schema itself is valid - # This throws an exception if it isn't valid - JSONSchemaValidator.check_schema(schema) - - # Check that the properties we require are present - required_schema_fields = {"$id", "version", "properties"} - for rsf in required_schema_fields: - if rsf not in schema: - raise ValueError(f"{rsf} is required in schema specification") - - if (schema["$id"], schema["version"]) in self.schemas: - raise ValueError( - "Schema {} version {} has already been registered.".format( - schema["$id"], schema["version"] - ) - ) - - for p, attrs in schema["properties"].items(): - if p.startswith("__"): - raise ValueError( - "Schema {} has properties beginning with __, which is not allowed" - ) - - # Validate "categories" property in proposed schema. - try: - cats = attrs["categories"] - # Categories must be a list. - if not isinstance(cats, list): - raise ValueError( - 'The "categories" field in a registered schemas must be a list.' - ) - except KeyError: - raise KeyError( - 'All properties must have a "categories" field that describes ' - 'the type of data being collected. The "{}" property does not ' - "have a category field.".format(p) - ) - - self.schemas[(schema["$id"], schema["version"])] = schema - - def get_allowed_properties(self, schema_name): - """Get the allowed properties for an allowed schema.""" - config = self.allowed_schemas[schema_name] - try: - return set(config["allowed_properties"]) - except KeyError: - return set() - - def get_allowed_categories(self, schema_name): - """ - Return a set of allowed categories for a given schema - from the EventLog's config. 
- """ - config = self.allowed_schemas[schema_name] - try: - allowed_categories = config["allowed_categories"] - allowed_categories.append("unrestricted") - return set(allowed_categories) - except KeyError: - return {"unrestricted"} - - def record_event(self, schema_name, version, event, timestamp_override=None): + def _skip_message(record, **kwargs): + """ + Remove 'message' from log record. + It is always emitted with 'null', and we do not want it, + since we are always emitting events only + """ + del record["message"] + return json.dumps(record, **kwargs) + + formatter = jsonlogger.JsonFormatter(json_serializer=_skip_message) + handler.setFormatter(formatter) + self.log.addHandler(handler) + if handler not in self.handlers: + self.handlers.append(handler) + + def remove_handler(self, handler): + """Remove the logging handler from the logger and list of handlers.""" + self.log.removeHandler(handler) + if handler in self.handlers: + self.handlers.remove(handler) + + def emit(self, schema_name, version, event, timestamp_override=None): """ Record given event with schema has occurred. @@ -214,23 +130,11 @@ def record_event(self, schema_name, version, event, timestamp_override=None): dict The recorded event data """ - if not (self.handlers and schema_name in self.allowed_schemas): + if not self.handlers or (schema_name, version) not in self.schema_registry: # if handler isn't set up or schema is not explicitly whitelisted, # don't do anything return - if (schema_name, version) not in self.schemas: - raise ValueError( - "Schema {schema_name} version {version} not registered".format( - schema_name=schema_name, version=version - ) - ) - - schema = self.schemas[(schema_name, version)] - - # Validate the event data. - JSONSchemaValidator(schema).validate(event) - # Generate the empty event capsule. 
if timestamp_override is None: timestamp = datetime.utcnow() @@ -242,16 +146,9 @@ def record_event(self, schema_name, version, event, timestamp_override=None): "__schema_version__": version, "__metadata_version__": EVENTS_METADATA_VERSION, } - - # Filter properties in the incoming event based on the - # allowed categories and properties from the eventlog config. - allowed_categories = self.get_allowed_categories(schema_name) - allowed_properties = self.get_allowed_properties(schema_name) - - filtered_event = filter_categories_from_event( - event, schema, allowed_categories, allowed_properties - ) - capsule.update(filtered_event) - + schema = self.schema_registry.get((schema_name, version)) + schema.validate(event) + schema.enforce_redaction_policies(event) + capsule.update(event) self.log.info(capsule) return capsule diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py new file mode 100644 index 0000000..19b69ce --- /dev/null +++ b/jupyter_events/schema.py @@ -0,0 +1,178 @@ +import pathlib +from typing import Any, Dict, Hashable, List, Sequence, Union + +from jsonschema import RefResolver, validators + +from .yaml import yaml + + +def _nested_pop(dictionary: dict, nested_keys: Sequence[Hashable]) -> Any: + """Pop a item nested anywhere in a dwictionary using the + list of (hashable) keys to locate the item. + """ + d = dictionary + last_entry = nested_keys[-1] + for key in nested_keys[:-1]: + d = d[key] + return d.pop(last_entry) + + +def _get_redaction_policies(schema: dict): + """A recursive function that iterates an event schema + and returns a mapping of redaction policies to + (nested) properties (identified by a sequence of keys). 
+ """ + redaction_policies: Dict[str, List[str]] = {} + + def _extract_policies(subschema, key_sequence=()): + props = subschema["properties"] + for key, obj in props.items(): + updated_key_sequence = key_sequence + (key,) + if isinstance(obj, dict) and "properties" in obj: + _extract_policies(obj, updated_key_sequence) + + # Update the list in place. + for policy in obj["redactionPolicy"]: + policies_list = redaction_policies.get(policy, []) + policies_list.append(updated_key_sequence) + redaction_policies[policy] = policies_list + + # Start the recursion + _extract_policies(schema) + return redaction_policies + + +METASCHEMA_PATH = pathlib.Path(__file__).parent.joinpath("schemas") +EVENT_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("event-metaschema.yml") +EVENT_METASCHEMA = yaml.load(EVENT_METASCHEMA_FILEPATH) +PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("property-metaschema.yml") +PROPERTY_METASCHEMA = yaml.load(PROPERTY_METASCHEMA_FILEPATH) +METASCHEMA_RESOLVER = RefResolver( + base_uri=EVENT_METASCHEMA["$id"], + referrer=EVENT_METASCHEMA, + store={PROPERTY_METASCHEMA["$id"]: PROPERTY_METASCHEMA}, +) +METASCHEMA_VALIDATOR = validators.Draft7Validator( + EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER +) + + +class EventSchema: + """A validated schema that can be used. + + On instantiation, validate the schema against + Jupyter Event's metaschema. + + Parameters + ---------- + schema: dict + JSON schema to validate against Jupyter Events. + + validator_class: jsonschema.validators + The validator class from jsonschema used to validate instances + of this event schema. The schema itself will be validated + against Jupyter Event's metaschema to ensure that + any schema registered here follows the expected form + of Jupyter Events. + + resolver: + RefResolver for nested JSON schema references. + + allowed_policies: set + A set of redaction policied allowed by this event schema. 
+ Each property in the schema must have a `redactionPolicy` + annotation representing the level of sensitivity of the + data collected by this event. In order for that data + to be emitted of Jupyter Events, the matching redaction + policy must be listed here. + + """ + + def __init__( + self, + schema, + validator_class=validators.Draft7Validator, + resolver=None, + allowed_policies: Union[str, list] = "all", + ): + # Validate the schema against Jupyter Events metaschema. + METASCHEMA_VALIDATOR.validate(schema) + # Build a mapping of all property redaction policies. + self._redaction_policies = _get_redaction_policies(schema) + self._allowed_policies = self._validate_allowed_policies(allowed_policies) + # Create a validator for this schema + self._validator = validator_class(schema, resolver=resolver) + self._schema = schema + + def _validate_allowed_policies(self, allowed_policies): + value_type = type(allowed_policies) + if value_type == str and allowed_policies == "all": + return set(self.redaction_policies.keys()) + elif value_type == list: + return set(["unrestricted"] + list(allowed_policies)) + raise TypeError( + "allowed_policies must be the literal string, 'all', or a list of " + "redaction polices" + ) + + @property + def id(self): + """Schema $id field.""" + return self._schema["$id"] + + @property + def version(self): + """Schema's version.""" + return self._schema["version"] + + @property + def registry_key(self): + return (self.id, self.version) + + @property + def allowed_policies(self): + """The redaction policies that will not be redacted when an + incoming event is processed. 
+ """ + return self._allowed_policies + + @classmethod + def from_file( + cls, + filepath, + validator_class=validators.Draft7Validator, + resolver=None, + allowed_policies="all", + ): + schema = yaml.load(filepath) + return cls( + schema=schema, + validator_class=validator_class, + resolver=resolver, + allowed_policies=allowed_policies, + ) + + @property + def redaction_policies(self) -> Dict[str, List[str]]: + """Mapping of the redaction policies in this schema to + the (nested) properties where they are defined. + """ + return self._redaction_policies + + def validate(self, data: dict) -> None: + """Validate an incoming instance of this event schema.""" + self._validator.validate(data) + + def enforce_redaction_policy(self, data: dict) -> None: + """Redact fields from""" + # Find all policies not explicitly allowed. + named_policies = set(self.redaction_policies.keys()) + redacted_policies = named_policies - self.allowed_policies + for policy in redacted_policies: + for property in self.redaction_policies[policy]: + _nested_pop(data, property) + + def process(self, data: dict) -> None: + """Validate event data and enforce an redaction policies.""" + self.validate(data) + self.enforce_redaction_policy(data) diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py new file mode 100644 index 0000000..5e22ede --- /dev/null +++ b/jupyter_events/schema_registry.py @@ -0,0 +1,62 @@ +from multiprocessing import Event + +from .schema import EventSchema + + +class SchemaRegistryException(Exception): + pass + + +class SchemaRegistry: + def __init__(self, schemas={}, allowed_policies="all"): + self._schemas = schemas + self._allowed_policies = allowed_policies + + @property + def allowed_policies(self): + return self._allowed_policies + + def __contains__(self, registry_key): + """Syntax sugar to check if a schema is found in the registry""" + return registry_key in self._schemas + + def _add(self, schema_obj: EventSchema): + if 
schema_obj.registry_key in self._schemas: + raise SchemaRegistryException( + f"The schema, {schema_obj.id} " + f"(version {schema_obj.version}), is already " + "registered. Try removing it and registering it again." + ) + self._schemas[schema_obj.registry_key] = schema_obj + + def register(self, schema_data): + """Register a schema.""" + schema = EventSchema(schema_data, allowed_policies=self.allowed_policies) + self._add(schema) + + def register_from_file(self, schema_filepath): + """Register a schema from a file.""" + schema = EventSchema.from_file( + schema_filepath, allowed_policies=self.allowed_policies + ) + self._add(schema) + + def get(self, registry_key) -> EventSchema: + try: + return self._schemas[registry_key] + except KeyError: + raise KeyError( + f"The requested schema, {registry_key[0]} " + f"(version {registry_key[1]}), was not found in the " + "schema registry. Are you sure it was previously registered?" + ) + + def remove(self, registry_key): + try: + del self._schemas[registry_key] + except KeyError: + raise KeyError( + f"The requested schema, {registry_key[0]} " + f"(version {registry_key[1]}), was not found in the " + "schema registry. Are you sure it was previously registered?" + ) diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml new file mode 100644 index 0000000..b5e9250 --- /dev/null +++ b/jupyter_events/schemas/event-metaschema.yml @@ -0,0 +1,31 @@ +$schema: http://json-schema.org/draft-07/schema +$id: http://event.jupyter.org/event-metaschema +version: 1 +title: Event Metaschema +description: | + A meta schema for validating that all registered Jupyter Event + schemas are appropriately defined. 
+type: object +properties: + $id: + type: string + version: + type: integer + title: + type: string + description: + type: string + redactionPolicies: + type: array + items: + type: string + properties: + type: object + additionalProperties: + $ref: http://event.jupyter.org/property-metaschema + +required: + - $id + - version + - redactionPolicy + - properties diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml new file mode 100644 index 0000000..0a5f318 --- /dev/null +++ b/jupyter_events/schemas/property-metaschema.yml @@ -0,0 +1,20 @@ +$schema: http://json-schema.org/draft-07/schema +$id: http://event.jupyter.org/property-metaschema +version: 1 +title: Property Metaschema +description: | + A metaschema for validating properties within + an event schema +properties: + title: + type: string + description: + type: string + redactionPolicies: + type: array + items: + type: string + additionalProperties: true +required: + - title + - redactionPolicy diff --git a/jupyter_events/yaml.py b/jupyter_events/yaml.py new file mode 100644 index 0000000..4f839cc --- /dev/null +++ b/jupyter_events/yaml.py @@ -0,0 +1,21 @@ +try: + from ruamel.yaml import YAML +except ImportError as e: + # check for known conda bug that prevents + # pip from installing ruamel.yaml dependency + try: + import ruamel_yaml # noqa + except ImportError: + # nope, regular import error; raise original + raise e + else: + # have conda fork ruamel_yaml, but not ruamel.yaml. + # this is a bug in the ruamel_yaml conda package + # mistakenly identifying itself as ruamel.yaml to pip. + # conda install the 'real' ruamel.yaml to fix + raise ImportError( + "Missing dependency ruamel.yaml. 
Try: `conda install ruamel.yaml`" + ) + + +yaml = YAML(typ="safe") diff --git a/tests/test_category_filtering.py b/tests/test_category_filtering.py index 55d1197..a7b3b1c 100644 --- a/tests/test_category_filtering.py +++ b/tests/test_category_filtering.py @@ -164,7 +164,7 @@ def test_category_filtering(allowed_schemas, expected_output): [ ( # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": []}}, + {SCHEMA_ID: {"allowed_policies": []}}, # Expected properties in the recorded event {"nothing-exciting": "hello, world", "users": None}, ), diff --git a/tests/test_eventlog.py b/tests/test_eventlog.py deleted file mode 100644 index 0e77fd3..0000000 --- a/tests/test_eventlog.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging - -import pytest -from traitlets import TraitError -from traitlets.config.loader import PyFileConfigLoader - -from jupyter_events.logger import EventLogger - -GOOD_CONFIG = """ -import logging - -c.EventLogger.handlers = [ - logging.StreamHandler() -] -""" - -BAD_CONFIG = """ -import logging - -c.EventLogger.handlers = [ - 0 -] -""" - - -def get_config_from_file(path, content): - # Write config file - filename = "config.py" - config_file = path / filename - config_file.write_text(content) - - # Load written file. 
- loader = PyFileConfigLoader(filename, path=str(path)) - cfg = loader.load_config() - return cfg - - -def test_good_config_file(tmp_path): - cfg = get_config_from_file(tmp_path, GOOD_CONFIG) - - # Pass config to EventLogger - e = EventLogger(config=cfg) - - assert len(e.handlers) > 0 - assert isinstance(e.handlers[0], logging.Handler) - - -def test_bad_config_file(tmp_path): - cfg = get_config_from_file(tmp_path, BAD_CONFIG) - - with pytest.raises(TraitError): - EventLogger(config=cfg) diff --git a/tests/test_register_schema.py b/tests/test_register_schema.py index ef64175..3edb135 100644 --- a/tests/test_register_schema.py +++ b/tests/test_register_schema.py @@ -5,6 +5,7 @@ import jsonschema import pytest +from jsonschema.exceptions import ValidationError from ruamel.yaml import YAML from jupyter_events.logger import EventLogger @@ -15,7 +16,7 @@ def test_register_invalid_schema(): Invalid JSON Schemas should fail registration """ el = EventLogger() - with pytest.raises(jsonschema.SchemaError): + with pytest.raises(ValidationError): el.register_schema( { # Totally invalid @@ -31,10 +32,10 @@ def test_missing_required_properties(): They aren't required by JSON Schema itself """ el = EventLogger() - with pytest.raises(ValueError): + with pytest.raises(ValidationError): el.register_schema({"properties": {}}) - with pytest.raises(ValueError): + with pytest.raises(ValidationError): el.register_schema( { "$id": "something", @@ -50,13 +51,19 @@ def test_reserved_properties(): These are reserved """ el = EventLogger() - with pytest.raises(ValueError): + with pytest.raises(ValidationError): el.register_schema( { "$id": "test/test", + "title": "Test", "version": 1, + "redactionPolicy": ["unrestricted"], "properties": { - "__fail__": {"type": "string", "categories": ["unrestricted"]}, + "__fail__": { + "type": "string", + "title": "test", + "redactionPolicy": ["unrestricted"], + }, }, } ) @@ -67,10 +74,15 @@ def test_timestamp_override(): Simple test for overriding 
timestamp """ schema = { - "$id": "test/test", + "$id": "test/test2", "version": 1, + "redactionPolicy": ["unrestricted"], "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicy": ["unrestricted"], + }, }, } @@ -78,33 +90,30 @@ def test_timestamp_override(): handler = logging.StreamHandler(output) el = EventLogger(handlers=[handler]) el.register_schema(schema) - el.allowed_schemas = ["test/test"] timestamp_override = datetime.utcnow() - timedelta(days=1) - el.record_event( - "test/test", - 1, - { - "something": "blah", - }, - timestamp_override=timestamp_override, + el.emit( + "test/test", 1, {"something": "blah"}, timestamp_override=timestamp_override ) handler.flush() - event_capsule = json.loads(output.getvalue()) - assert event_capsule["__timestamp__"] == timestamp_override.isoformat() + "Z" -def test_record_event(): +def test_emit(): """ Simple test for emitting valid events """ schema = { - "$id": "test/test", + "$id": "test/test3", "version": 1, + "redactionPolicy": ["unrestricted"], "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicy": ["unrestricted"], + }, }, } @@ -112,9 +121,8 @@ def test_record_event(): handler = logging.StreamHandler(output) el = EventLogger(handlers=[handler]) el.register_schema(schema) - el.allowed_schemas = ["test/test"] - el.record_event( + el.emit( "test/test", 1, { @@ -141,22 +149,25 @@ def test_register_schema_file(tmp_path): Register schema from a file """ schema = { - "$id": "test/test", + "$id": "test/test3", "version": 1, + "redactionPolicy": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicy": ["unrestricted"], + }, }, } el = EventLogger() - yaml = YAML(typ="safe") - 
schema_file = tmp_path.joinpath("schema.yml") yaml.dump(schema, schema_file) - el.register_schema_file(str(schema_file)) - - assert schema in el.schemas.values() + el.register_schema_file(schema_file) + assert ("test/test3", 1) in el.schema_registry def test_register_schema_file_object(tmp_path): @@ -177,6 +188,7 @@ def test_register_schema_file_object(tmp_path): schema_file = tmp_path.joinpath("schema.yml") yaml.dump(schema, schema_file) + with open(str(schema_file)) as f: el.register_schema_file(f) diff --git a/tests/utils.py b/tests/utils.py index 0c2362c..20bac8c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -6,17 +6,17 @@ from jupyter_events.logger import EventLogger -def get_event_data(event, schema, schema_id, version, allowed_schemas): +def get_event_data(event, schema, schema_id, version, allowed_policies): sink = io.StringIO() # Create a handler that captures+records events with allowed tags. handler = logging.StreamHandler(sink) - e = EventLogger(handlers=[handler], allowed_schemas=allowed_schemas) + e = EventLogger(handlers=[handler], allowed_policies=allowed_policies) e.register_schema(schema) # Record event and read output - e.record_event(schema_id, version, deepcopy(event)) + e.emit(schema_id, version, deepcopy(event)) recorded_event = json.loads(sink.getvalue()) return { From 91a9501bb9a4c7346197252dbea7bd067fb97cbd Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 14 Jul 2022 15:06:04 -0500 Subject: [PATCH 02/14] working on tests --- jupyter_events/logger.py | 7 +- jupyter_events/schema.py | 6 +- jupyter_events/schema_registry.py | 4 +- jupyter_events/schemas/event-metaschema.yml | 2 +- .../schemas/property-metaschema.yml | 2 +- tests/test_register_schema.py | 105 +++++++++++++----- 6 files changed, 88 insertions(+), 38 deletions(-) diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 34e7764..7d62f5e 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -6,7 +6,7 @@ from datetime import datetime from 
pythonjsonlogger import jsonlogger -from traitlets import Instance, List +from traitlets import Instance, List, default from traitlets.config import Config, Configurable from . import EVENTS_METADATA_VERSION @@ -40,11 +40,14 @@ class EventLogger(Configurable): schema_registry = Instance(SchemaRegistry) + @default("schema_registry") + def _default_schema_registry(self): + return SchemaRegistry(allowed_policies=self.allowed_policies) + def __init__(self, *args, **kwargs): # We need to initialize the configurable before # adding the logging handlers. super().__init__(*args, **kwargs) - self.schema_registry = SchemaRegistry(allowed_policies=self.allowed_policies) # Use a unique name for the logger so that multiple instances of EventLog do not write # to each other's handlers. log_name = __name__ + "." + str(id(self)) diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 19b69ce..1e28a5f 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -32,7 +32,7 @@ def _extract_policies(subschema, key_sequence=()): _extract_policies(obj, updated_key_sequence) # Update the list in place. - for policy in obj["redactionPolicy"]: + for policy in obj["redactionPolicies"]: policies_list = redaction_policies.get(policy, []) policies_list.append(updated_key_sequence) redaction_policies[policy] = policies_list @@ -163,7 +163,7 @@ def validate(self, data: dict) -> None: """Validate an incoming instance of this event schema.""" self._validator.validate(data) - def enforce_redaction_policy(self, data: dict) -> None: + def enforce_redaction_policies(self, data: dict) -> None: """Redact fields from""" # Find all policies not explicitly allowed. 
named_policies = set(self.redaction_policies.keys()) @@ -175,4 +175,4 @@ def enforce_redaction_policy(self, data: dict) -> None: def process(self, data: dict) -> None: """Validate event data and enforce an redaction policies.""" self.validate(data) - self.enforce_redaction_policy(data) + self.enforce_redaction_policies(data) diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index 5e22ede..b48f164 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -8,8 +8,8 @@ class SchemaRegistryException(Exception): class SchemaRegistry: - def __init__(self, schemas={}, allowed_policies="all"): - self._schemas = schemas + def __init__(self, schemas=None, allowed_policies="all"): + self._schemas = schemas or {} self._allowed_policies = allowed_policies @property diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml index b5e9250..2ce1b6e 100644 --- a/jupyter_events/schemas/event-metaschema.yml +++ b/jupyter_events/schemas/event-metaschema.yml @@ -27,5 +27,5 @@ properties: required: - $id - version - - redactionPolicy + - redactionPolicies - properties diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index 0a5f318..75270e5 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -17,4 +17,4 @@ properties: additionalProperties: true required: - title - - redactionPolicy + - redactionPolicies diff --git a/tests/test_register_schema.py b/tests/test_register_schema.py index 3edb135..3a00de4 100644 --- a/tests/test_register_schema.py +++ b/tests/test_register_schema.py @@ -57,31 +57,36 @@ def test_reserved_properties(): "$id": "test/test", "title": "Test", "version": 1, - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], "properties": { "__fail__": { "type": "string", "title": "test", - "redactionPolicy": ["unrestricted"], + 
"redactionPolicies": ["unrestricted"], }, }, } ) +@pytest.fixture(autouse=True) +def resetter(): + pass + + def test_timestamp_override(): """ Simple test for overriding timestamp """ schema = { - "$id": "test/test2", + "$id": "test/test", "version": 1, - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], }, }, } @@ -105,14 +110,14 @@ def test_emit(): Simple test for emitting valid events """ schema = { - "$id": "test/test3", + "$id": "test/test", "version": 1, - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], }, }, } @@ -149,15 +154,15 @@ def test_register_schema_file(tmp_path): Register schema from a file """ schema = { - "$id": "test/test3", + "$id": "test/test", "version": 1, - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicy": ["unrestricted"], + "redactionPolicies": ["unrestricted"], }, }, } @@ -177,8 +182,14 @@ def test_register_schema_file_object(tmp_path): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, } @@ -202,18 +213,23 @@ def test_allowed_schemas(): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, 
} - output = io.StringIO() handler = logging.StreamHandler(output) el = EventLogger(handlers=[handler]) # Just register schema, but do not mark it as allowed el.register_schema(schema) - el.record_event( + el.emit( "test/test", 1, { @@ -225,16 +241,25 @@ def test_allowed_schemas(): assert output.getvalue() == "" -def test_record_event_badschema(): +def test_emit_badschema(): """ Fail fast when an event doesn't conform to its schema """ schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, - "status": {"enum": ["success", "failure"], "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + "status": { + "enum": ["success", "failure"], + "redactionPolicies": ["unrestricted"], + }, }, } @@ -243,25 +268,35 @@ def test_record_event_badschema(): el.allowed_schemas = ["test/test"] with pytest.raises(jsonschema.ValidationError): - el.record_event( - "test/test", 1, {"something": "blah", "status": "hi"} # 'not-in-enum' - ) + el.emit("test/test", 1, {"something": "blah", "status": "hi"}) # 'not-in-enum' def test_unique_logger_instances(): schema0 = { "$id": "test/test0", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, } schema1 = { "$id": "test/test1", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, } @@ -278,14 +313,14 @@ def test_unique_logger_instances(): el1.register_schema(schema1) el1.allowed_schemas = ["test/test1"] - el0.record_event( 
+ el0.emit( "test/test0", 1, { "something": "blah", }, ) - el1.record_event( + el1.emit( "test/test1", 1, { @@ -322,18 +357,30 @@ def test_unique_logger_instances(): def test_register_duplicate_schemas(): schema0 = { - "$id": "test/test", + "$id": "test/test0", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "something": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, } schema1 = { - "$id": "test/test", + "$id": "test/test1", "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", "properties": { - "somethingelse": {"type": "string", "categories": ["unrestricted"]}, + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, }, } From 9c36c10274eb64cd3151329c9b4502501b3c5a69 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Tue, 19 Jul 2022 16:03:38 -0700 Subject: [PATCH 03/14] working unit tests --- jupyter_events/logger.py | 7 +- jupyter_events/schema.py | 114 +++-- jupyter_events/schema_registry.py | 14 +- .../schemas/property-metaschema.yml | 6 + .../schemas/reserved-property-metaschema.yml | 17 + jupyter_events/validators.py | 26 ++ .../schemas/bad/missing-parent-policies.yaml | 17 + tests/schemas/bad/missing-policy-array.yaml | 30 ++ .../bad/missing-policy-nested-array.yaml | 47 ++ tests/schemas/good/array.yaml | 32 ++ tests/schemas/good/basic.yaml | 16 + tests/schemas/good/nested-array.yaml | 48 ++ tests/test_allowed_schemas.py | 199 -------- tests/test_category_filtering.py | 440 ------------------ tests/test_logger.py | 52 +++ tests/test_redaction.py | 66 +++ tests/test_schema.py | 51 ++ ...ster_schema.py => test_schema_registry.py} | 98 ++-- tests/utils.py | 7 +- 19 files changed, 517 insertions(+), 770 deletions(-) create mode 100644 jupyter_events/schemas/reserved-property-metaschema.yml create mode 100644 jupyter_events/validators.py create 
mode 100644 tests/schemas/bad/missing-parent-policies.yaml create mode 100644 tests/schemas/bad/missing-policy-array.yaml create mode 100644 tests/schemas/bad/missing-policy-nested-array.yaml create mode 100644 tests/schemas/good/array.yaml create mode 100644 tests/schemas/good/basic.yaml create mode 100644 tests/schemas/good/nested-array.yaml delete mode 100644 tests/test_allowed_schemas.py delete mode 100644 tests/test_category_filtering.py create mode 100644 tests/test_logger.py create mode 100644 tests/test_redaction.py create mode 100644 tests/test_schema.py rename tests/{test_register_schema.py => test_schema_registry.py} (82%) diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 7d62f5e..8f425c9 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -28,8 +28,9 @@ class EventLogger(Configurable): """, ).tag(config=True) - allowed_policies = List( - default_value=["all"], + redacted_policies = List( + default_value=None, + allow_none=True, help=( """ A list of the redaction policies that will not be redacted @@ -42,7 +43,7 @@ class EventLogger(Configurable): @default("schema_registry") def _default_schema_registry(self): - return SchemaRegistry(allowed_policies=self.allowed_policies) + return SchemaRegistry(redacted_policies=self.redacted_policies) def __init__(self, *args, **kwargs): # We need to initialize the configurable before diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 1e28a5f..08272fb 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -1,23 +1,37 @@ -import pathlib from typing import Any, Dict, Hashable, List, Sequence, Union -from jsonschema import RefResolver, validators +from jsonschema import validators +from .validators import JUPYTER_EVENTS_VALIDATOR from .yaml import yaml -def _nested_pop(dictionary: dict, nested_keys: Sequence[Hashable]) -> Any: +def _pop_nested_redacted_fields( + schema_data: dict, policy_location: Sequence[Hashable] +) -> Any: """Pop a item 
nested anywhere in a dwictionary using the list of (hashable) keys to locate the item. """ - d = dictionary - last_entry = nested_keys[-1] - for key in nested_keys[:-1]: - d = d[key] - return d.pop(last_entry) - - -def _get_redaction_policies(schema: dict): + # Begin walking the sequence of keys to the policy + # location given. + nested_data = schema_data + for i, el in enumerate(policy_location[:-1]): + # Handle arrays of objects. + if el == "__array__": + for j, _ in enumerate(nested_data): + branch = policy_location[i + 1 :] + _pop_nested_redacted_fields(nested_data[j], branch) + return + # Try moving into nested child schema. + try: + nested_data = nested_data[el] + except KeyError: + return + # If we made it this far, we ended on a policy that needs to be popped. + return nested_data.pop(policy_location[-1]) + + +def _find_redaction_policies(schema: dict): """A recursive function that iterates an event schema and returns a mapping of redaction policies to (nested) properties (identified by a sequence of keys). @@ -28,8 +42,17 @@ def _extract_policies(subschema, key_sequence=()): props = subschema["properties"] for key, obj in props.items(): updated_key_sequence = key_sequence + (key,) - if isinstance(obj, dict) and "properties" in obj: - _extract_policies(obj, updated_key_sequence) + + def _nested_extract_policies(obj, updated_key_sequence): + if isinstance(obj, dict): + if "properties" in obj: + _extract_policies(obj, updated_key_sequence) + if "items" in obj and "properties" in obj["items"]: + _nested_extract_policies( + obj["items"], updated_key_sequence + ("__array__",) + ) + + _nested_extract_policies(obj, updated_key_sequence) # Update the list in place. 
for policy in obj["redactionPolicies"]: @@ -42,21 +65,6 @@ def _extract_policies(subschema, key_sequence=()): return redaction_policies -METASCHEMA_PATH = pathlib.Path(__file__).parent.joinpath("schemas") -EVENT_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("event-metaschema.yml") -EVENT_METASCHEMA = yaml.load(EVENT_METASCHEMA_FILEPATH) -PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("property-metaschema.yml") -PROPERTY_METASCHEMA = yaml.load(PROPERTY_METASCHEMA_FILEPATH) -METASCHEMA_RESOLVER = RefResolver( - base_uri=EVENT_METASCHEMA["$id"], - referrer=EVENT_METASCHEMA, - store={PROPERTY_METASCHEMA["$id"]: PROPERTY_METASCHEMA}, -) -METASCHEMA_VALIDATOR = validators.Draft7Validator( - EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER -) - - class EventSchema: """A validated schema that can be used. @@ -93,25 +101,27 @@ def __init__( schema, validator_class=validators.Draft7Validator, resolver=None, - allowed_policies: Union[str, list] = "all", + redacted_policies: Union[str, list, None] = None, ): # Validate the schema against Jupyter Events metaschema. - METASCHEMA_VALIDATOR.validate(schema) + JUPYTER_EVENTS_VALIDATOR.validate(schema) # Build a mapping of all property redaction policies. 
- self._redaction_policies = _get_redaction_policies(schema) - self._allowed_policies = self._validate_allowed_policies(allowed_policies) + self._redaction_policies_locations = _find_redaction_policies(schema) + self._redacted_policies = self._validate_redacted_policies(redacted_policies) # Create a validator for this schema self._validator = validator_class(schema, resolver=resolver) self._schema = schema - def _validate_allowed_policies(self, allowed_policies): - value_type = type(allowed_policies) - if value_type == str and allowed_policies == "all": - return set(self.redaction_policies.keys()) - elif value_type == list: - return set(["unrestricted"] + list(allowed_policies)) + def _validate_redacted_policies(self, redacted_policies): + if redacted_policies is None: + return set() + value_type = type(redacted_policies) + if value_type == str and redacted_policies == "all": + return set(self.redaction_policies_locations.keys()) + if value_type == list: + return set(redacted_policies) raise TypeError( - "allowed_policies must be the literal string, 'all', or a list of " + "redacted_policies must be the literal string, 'all', or a list of " "redaction polices" ) @@ -130,11 +140,11 @@ def registry_key(self): return (self.id, self.version) @property - def allowed_policies(self): + def redacted_policies(self): """The redaction policies that will not be redacted when an incoming event is processed. 
""" - return self._allowed_policies + return self._redacted_policies @classmethod def from_file( @@ -142,22 +152,22 @@ def from_file( filepath, validator_class=validators.Draft7Validator, resolver=None, - allowed_policies="all", + redacted_policies=None, ): schema = yaml.load(filepath) return cls( schema=schema, validator_class=validator_class, resolver=resolver, - allowed_policies=allowed_policies, + redacted_policies=redacted_policies, ) @property - def redaction_policies(self) -> Dict[str, List[str]]: + def redaction_policies_locations(self) -> Dict[str, List[str]]: """Mapping of the redaction policies in this schema to the (nested) properties where they are defined. """ - return self._redaction_policies + return self._redaction_policies_locations def validate(self, data: dict) -> None: """Validate an incoming instance of this event schema.""" @@ -165,12 +175,14 @@ def validate(self, data: dict) -> None: def enforce_redaction_policies(self, data: dict) -> None: """Redact fields from""" - # Find all policies not explicitly allowed. - named_policies = set(self.redaction_policies.keys()) - redacted_policies = named_policies - self.allowed_policies - for policy in redacted_policies: - for property in self.redaction_policies[policy]: - _nested_pop(data, property) + # # Find all policies not explicitly allowed. 
+ # named_policies = set(self.redaction_policies_locations.keys()) + # redacted_policies = named_policies - self.unredacted_policies + for policy_type in self.redacted_policies: + policy_locations = self._redaction_policies_locations[policy_type] + print(policy_type, policy_locations) + for item in policy_locations: + _pop_nested_redacted_fields(data, item) def process(self, data: dict) -> None: """Validate event data and enforce an redaction policies.""" diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index b48f164..0f070ea 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -4,17 +4,17 @@ class SchemaRegistryException(Exception): - pass + """Exception class for Jupyter Events Schema Registry Errors.""" class SchemaRegistry: - def __init__(self, schemas=None, allowed_policies="all"): + def __init__(self, schemas=None, redacted_policies=None): self._schemas = schemas or {} - self._allowed_policies = allowed_policies + self._redacted_policies = redacted_policies @property - def allowed_policies(self): - return self._allowed_policies + def redacted_policies(self): + return self._redacted_policies def __contains__(self, registry_key): """Syntax sugar to check if a schema is found in the registry""" @@ -31,13 +31,13 @@ def _add(self, schema_obj: EventSchema): def register(self, schema_data): """Register a schema.""" - schema = EventSchema(schema_data, allowed_policies=self.allowed_policies) + schema = EventSchema(schema_data, redacted_policies=self.redacted_policies) self._add(schema) def register_from_file(self, schema_filepath): """Register a schema from a file.""" schema = EventSchema.from_file( - schema_filepath, allowed_policies=self.allowed_policies + schema_filepath, redacted_policies=self.redacted_policies ) self._add(schema) diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index 75270e5..f61ef35 100644 --- 
a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -14,6 +14,12 @@ properties: type: array items: type: string + properties: + type: object + additionalProperties: + $ref: http://event.jupyter.org/property-metaschema + items: + $ref: http://event.jupyter.org/property-metaschema additionalProperties: true required: - title diff --git a/jupyter_events/schemas/reserved-property-metaschema.yml b/jupyter_events/schemas/reserved-property-metaschema.yml new file mode 100644 index 0000000..42411b7 --- /dev/null +++ b/jupyter_events/schemas/reserved-property-metaschema.yml @@ -0,0 +1,17 @@ +$schema: http://json-schema.org/draft-07/schema +$id: http://event.jupyter.org/reserved-property-metaschema +version: 1 +title: Reserved Property Metaschema +description: | + Property names that are reserved for Jupyter Events and should not + be included in an Event Schema. +patternProperties: + ^__: + type: + - string + - object + - number + - integer + - boolean + - array +minProperties: 1 diff --git a/jupyter_events/validators.py b/jupyter_events/validators.py new file mode 100644 index 0000000..60443d5 --- /dev/null +++ b/jupyter_events/validators.py @@ -0,0 +1,26 @@ +import pathlib + +from jsonschema import RefResolver, validators + +from .yaml import yaml + +METASCHEMA_PATH = pathlib.Path(__file__).parent.joinpath("schemas") +EVENT_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("event-metaschema.yml") +EVENT_METASCHEMA = yaml.load(EVENT_METASCHEMA_FILEPATH) +PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("property-metaschema.yml") +PROPERTY_METASCHEMA = yaml.load(PROPERTY_METASCHEMA_FILEPATH) +RESERVED_PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath( + "reserved-property-metaschema.yml" +) +RESERVED_PROPERTY_METASCHEMA = yaml.load(RESERVED_PROPERTY_METASCHEMA_FILEPATH) +METASCHEMA_RESOLVER = RefResolver( + base_uri=EVENT_METASCHEMA["$id"], + referrer=EVENT_METASCHEMA, + store={ + PROPERTY_METASCHEMA["$id"]: 
PROPERTY_METASCHEMA, + RESERVED_PROPERTY_METASCHEMA["$id"]: RESERVED_PROPERTY_METASCHEMA, + }, +) +JUPYTER_EVENTS_VALIDATOR = validators.Draft7Validator( + EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER +) diff --git a/tests/schemas/bad/missing-parent-policies.yaml b/tests/schemas/bad/missing-parent-policies.yaml new file mode 100644 index 0000000..bbf0c8b --- /dev/null +++ b/tests/schemas/bad/missing-parent-policies.yaml @@ -0,0 +1,17 @@ +$id: http://event.jupyter.org/test-simple +version: 1 +title: Simple Test Schema +description: | + Fails validation because the root level of this schema + is missing redactionPolicies. +type: object +properties: + prop1: + title: Test Property 1 + description: | + Test property 1. + redactionPolicies: + - unrestricted + type: string +required: + - prop1 diff --git a/tests/schemas/bad/missing-policy-array.yaml b/tests/schemas/bad/missing-policy-array.yaml new file mode 100644 index 0000000..a6897f9 --- /dev/null +++ b/tests/schemas/bad/missing-policy-array.yaml @@ -0,0 +1,30 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Schema with Array +description: | + A schema for an array of objects. +type: object +redactionPolicies: + - unrestricted +properties: + users: + title: Test User Array + description: | + Test User array. 
+ redactionPolicies: + - unrestricted + type: array + items: + type: object + title: User + redactionPolicies: + - unrestricted + properties: + email: + type: string + title: Email + id: + type: string + title: Name + redactionPolicies: + - user-identifier diff --git a/tests/schemas/bad/missing-policy-nested-array.yaml b/tests/schemas/bad/missing-policy-nested-array.yaml new file mode 100644 index 0000000..e76b4ad --- /dev/null +++ b/tests/schemas/bad/missing-policy-nested-array.yaml @@ -0,0 +1,47 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Schema with Array +description: | + Fails validation because an element in the nested array + doesn't have a redactionPolicies field (see "position"). +type: object +redactionPolicies: + - unrestricted +properties: + users: + title: Test User Array + description: | + Test User array. + redactionPolicies: + - unrestricted + type: array + items: + type: object + title: User + redactionPolicies: + - unrestricted + properties: + name: + type: string + title: Name + redactionPolicies: + - user-identifier + hobbies: + type: array + title: Hobbies + redactionPolicies: + - unrestricted + items: + type: object + title: Hobby + redactionPolicies: + - unrestricted + properties: + sport: + title: Sport Name + type: string + redactionPolicies: + - unrestricted + position: + title: Position + type: string diff --git a/tests/schemas/good/array.yaml b/tests/schemas/good/array.yaml new file mode 100644 index 0000000..336af90 --- /dev/null +++ b/tests/schemas/good/array.yaml @@ -0,0 +1,32 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Schema with Array +description: | + A schema for an array of objects. +type: object +redactionPolicies: + - unrestricted +properties: + users: + title: Test User Array + description: | + Test User array. 
+ redactionPolicies: + - unrestricted + type: array + items: + type: object + title: User + redactionPolicies: + - unrestricted + properties: + email: + type: string + title: Email + redactionPolicies: + - user-identifiable-information + id: + type: string + title: Name + redactionPolicies: + - user-identifier diff --git a/tests/schemas/good/basic.yaml b/tests/schemas/good/basic.yaml new file mode 100644 index 0000000..6511819 --- /dev/null +++ b/tests/schemas/good/basic.yaml @@ -0,0 +1,16 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Simple Test Schema +description: | + A simple schema for testing +type: object +redactionPolicies: + - unrestricted +properties: + prop: + title: Test Property + description: | + Test property. + redactionPolicies: + - unrestricted + type: string diff --git a/tests/schemas/good/nested-array.yaml b/tests/schemas/good/nested-array.yaml new file mode 100644 index 0000000..61bc86d --- /dev/null +++ b/tests/schemas/good/nested-array.yaml @@ -0,0 +1,48 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Schema with Array +description: | + A schema for an array of objects. +type: object +redactionPolicies: + - unrestricted +properties: + users: + title: Test User Array + description: | + Test User array. 
+ redactionPolicies: + - unrestricted + type: array + items: + type: object + title: User + redactionPolicies: + - unrestricted + properties: + name: + type: string + title: Name + redactionPolicies: + - user-identifier + hobbies: + type: array + title: Hobbies + redactionPolicies: + - unrestricted + items: + type: object + title: Hobby + redactionPolicies: + - unrestricted + properties: + sport: + title: Sport Name + type: string + redactionPolicies: + - unrestricted + position: + title: Position + type: string + redactionPolicies: + - user-identifiable-information diff --git a/tests/test_allowed_schemas.py b/tests/test_allowed_schemas.py deleted file mode 100644 index 7fba879..0000000 --- a/tests/test_allowed_schemas.py +++ /dev/null @@ -1,199 +0,0 @@ -from textwrap import dedent as _ - -import pytest -from ruamel.yaml import YAML - -from jupyter_events.logger import EventLogger - -from .utils import get_event_data - -SCHEMA_ID = "test.event" -VERSION = 1 - - -@pytest.fixture -def schema(): - return { - "$id": SCHEMA_ID, - "title": "Test Event", - "version": VERSION, - "description": "Test Event.", - "type": "object", - "properties": { - "nothing-exciting": { - "description": "a property with nothing exciting happening", - "categories": ["unrestricted"], - "type": "string", - }, - "id": { - "description": "user ID", - "categories": ["user-identifier"], - "type": "string", - }, - "email": { - "description": "email address", - "categories": ["user-identifiable-information"], - "type": "string", - }, - }, - } - - -def test_raised_exception_for_nonlist_categories(): - # Bad schema in yaml form. 
- yaml_schema = _( - """\ - $id: test.schema - title: Test Event - version: 1 - type: object - properties: - test_property: - description: testing a property - categories: user-identifier - type: string - """ - ) - yaml = YAML(typ="safe") - schema = yaml.load(yaml_schema) - - # Register schema with an EventLogger - e = EventLogger( - allowed_schemas={SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, - ) - - # This schema does not have categories as a list. - with pytest.raises(ValueError) as err: - e.register_schema(schema) - # Verify that the error message is the expected error message. - assert "must be a list." in str(err.value) - - -def test_missing_categories_label(): - # Bad schema in yaml form. - yaml_schema = _( - """\ - $id: test.schema - title: Test Event - version: 1 - type: object - properties: - test_property: - description: testing a property - type: string - """ - ) - yaml = YAML(typ="safe") - schema = yaml.load(yaml_schema) - - # Register schema with an EventLogger - e = EventLogger( - allowed_schemas={SCHEMA_ID: {"allowed_categories": ["random-category"]}} - ) - - # This schema does not have categories as a list. - with pytest.raises(KeyError) as err: - e.register_schema(schema) - # Verify that the error message is the expected error message. 
- assert 'All properties must have a "categories"' in str(err.value) - - -EVENT_DATA = { - "nothing-exciting": "hello, world", - "id": "test id", - "email": "test@testemail.com", -} - - -@pytest.mark.parametrize( - "allowed_schemas,expected_output", - [ - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": []}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": None, - "email": None, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["unrestricted"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": None, - "email": None, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": "test id", - "email": None, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifiable-information"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": None, - "email": "test@testemail.com", - }, - ), - ( - # User configuration for allowed_schemas - { - SCHEMA_ID: { - "allowed_categories": [ - "user-identifier", - "user-identifiable-information", - ] - } - }, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": "test id", - "email": "test@testemail.com", - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_properties": ["id"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "id": "test id", - "email": None, - }, - ), - ( - # User configuration for allowed_schemas - { - SCHEMA_ID: { - "allowed_properties": ["id"], - "allowed_categories": ["user-identifiable-information"], - } - }, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, 
world", - "id": "test id", - "email": "test@testemail.com", - }, - ), - ], -) -def test_allowed_schemas(schema, allowed_schemas, expected_output): - event_data = get_event_data(EVENT_DATA, schema, SCHEMA_ID, VERSION, allowed_schemas) - - # Verify that *exactly* the right properties are recorded. - assert expected_output == event_data diff --git a/tests/test_category_filtering.py b/tests/test_category_filtering.py deleted file mode 100644 index a7b3b1c..0000000 --- a/tests/test_category_filtering.py +++ /dev/null @@ -1,440 +0,0 @@ -import pytest - -from .utils import get_event_data - -SCHEMA_ID = "test.event" -VERSION = 1 - - -NESTED_CATEGORY_SCHEMA = { - "$id": SCHEMA_ID, - "title": "Test Event", - "version": VERSION, - "description": "Test Event.", - "type": "object", - "properties": { - "nothing-exciting": { - "description": "a property with nothing exciting happening", - "categories": ["unrestricted"], - "type": "string", - }, - "user": { - "description": "user", - "categories": ["user-identifier"], - "type": "object", - "properties": { - "email": { - "description": "email address", - "categories": ["user-identifiable-information"], - "type": "string", - }, - "id": {"description": "user ID", "type": "string"}, - }, - }, - }, -} - - -NESTED_EVENT_DATA = { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, -} - - -NESTED_CATEGORY_TEST_CASES = [ - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": []}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["unrestricted"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, - # Expected properties in the recorded event - 
{"nothing-exciting": "hello, world", "user": {"id": "test id", "email": None}}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifiable-information"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None}, - ), - ( - # User configuration for allowed_schemas - { - SCHEMA_ID: { - "allowed_categories": [ - "user-identifier", - "user-identifiable-information", - ] - } - }, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_properties": ["user"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, - }, - ), -] - - -@pytest.mark.parametrize("allowed_schemas,expected_output", NESTED_CATEGORY_TEST_CASES) -def test_category_filtering(allowed_schemas, expected_output): - event_data = get_event_data( - NESTED_EVENT_DATA, NESTED_CATEGORY_SCHEMA, SCHEMA_ID, VERSION, allowed_schemas - ) - - # Verify that *exactly* the right properties are recorded. 
- assert expected_output == event_data - - -NESTED_CATEGORY_ARRAY_SCHEMA = { - "$id": SCHEMA_ID, - "title": "Test Event", - "version": VERSION, - "description": "Test Event.", - "type": "object", - "properties": { - "nothing-exciting": { - "description": "a property with nothing exciting happening", - "categories": ["unrestricted"], - "type": "string", - }, - "users": { - "description": "user", - "categories": ["user-identifier"], - "type": "array", - "items": { - "properties": { - "email": { - "description": "email address", - "categories": ["user-identifiable-information"], - "type": "string", - }, - "id": {"description": "user ID", "type": "string"}, - } - }, - }, - }, -} - - -ARRAY_EVENT_DATA = { - "nothing-exciting": "hello, world", - "users": [ - { - "id": "test id 0", - "email": "test0@testemail.com", - }, - { - "id": "test id 1", - "email": "test1@testemail.com", - }, - ], -} - - -@pytest.mark.parametrize( - "allowed_schemas,expected_output", - [ - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_policies": []}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "users": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["unrestricted"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "users": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "users": [ - { - "id": "test id 0", - "email": None, - }, - { - "id": "test id 1", - "email": None, - }, - ], - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifiable-information"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "users": None}, - ), - ( - # User configuration for allowed_schemas - { - SCHEMA_ID: { - 
"allowed_categories": [ - "user-identifier", - "user-identifiable-information", - ] - } - }, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "users": [ - { - "id": "test id 0", - "email": "test0@testemail.com", - }, - { - "id": "test id 1", - "email": "test1@testemail.com", - }, - ], - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_properties": ["users"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "users": [ - { - "id": "test id 0", - "email": "test0@testemail.com", - }, - { - "id": "test id 1", - "email": "test1@testemail.com", - }, - ], - }, - ), - ], -) -def test_array_category_filtering(allowed_schemas, expected_output): - event_data = get_event_data( - ARRAY_EVENT_DATA, - NESTED_CATEGORY_ARRAY_SCHEMA, - SCHEMA_ID, - VERSION, - allowed_schemas, - ) - - # Verify that *exactly* the right properties are recorded. - assert expected_output == event_data - - -ADDITIONAL_PROP_EVENT_DATA = { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, - "extra": 1234, -} - - -@pytest.mark.parametrize( - "allowed_schemas,expected_output", - [ - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": []}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None, "extra": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["unrestricted"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None, "extra": None}, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "user": {"id": "test id", "email": None}, - "extra": None, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: 
{"allowed_categories": ["user-identifiable-information"]}}, - # Expected properties in the recorded event - {"nothing-exciting": "hello, world", "user": None, "extra": None}, - ), - ( - # User configuration for allowed_schemas - { - SCHEMA_ID: { - "allowed_categories": [ - "user-identifier", - "user-identifiable-information", - ] - } - }, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, - "extra": None, - }, - ), - ( - # User configuration for allowed_schemas - {SCHEMA_ID: {"allowed_properties": ["user"]}}, - # Expected properties in the recorded event - { - "nothing-exciting": "hello, world", - "user": { - "id": "test id", - "email": "test@testemail.com", - }, - "extra": None, - }, - ), - ], -) -def test_no_additional_properties(allowed_schemas, expected_output): - event_data = get_event_data( - ADDITIONAL_PROP_EVENT_DATA, - NESTED_CATEGORY_SCHEMA, - SCHEMA_ID, - VERSION, - allowed_schemas, - ) - - # Verify that *exactly* the right properties are recorded. 
- assert expected_output == event_data - - -NESTED_CATEGORY_SCHEMA_ALLOF = { - "$id": SCHEMA_ID, - "title": "Test Event", - "version": VERSION, - "description": "Test Event.", - "type": "object", - "properties": { - "nothing-exciting": { - "description": "a property with nothing exciting happening", - "categories": ["unrestricted"], - "type": "string", - }, - "user": { - "description": "user", - "categories": ["user-identifier"], - "type": "object", - "allOf": [ - { - "properties": { - "email": { - "description": "email address", - "categories": ["user-identifiable-information"], - "type": "string", - } - } - }, - {"properties": {"id": {"description": "user ID", "type": "string"}}}, - ], - }, - }, -} - - -NESTED_CATEGORY_SCHEMA_REF = { - "$id": SCHEMA_ID, - "title": "Test Event", - "version": VERSION, - "description": "Test Event.", - "type": "object", - "properties": { - "nothing-exciting": { - "description": "a property with nothing exciting happening", - "categories": ["unrestricted"], - "type": "string", - }, - "user": { - "description": "user", - "categories": ["user-identifier"], - "type": "object", - "$ref": "#/definitions/properties", - }, - }, - "definitions": { - "properties": { - "properties": { - "email": { - "description": "email address", - "categories": ["user-identifiable-information"], - "type": "string", - }, - "id": {"description": "user ID", "type": "string"}, - } - } - }, -} - - -@pytest.mark.parametrize("allowed_schemas,expected_output", NESTED_CATEGORY_TEST_CASES) -def test_category_filtering_ref(allowed_schemas, expected_output): - event_data = get_event_data( - NESTED_EVENT_DATA, - NESTED_CATEGORY_SCHEMA_REF, - SCHEMA_ID, - VERSION, - allowed_schemas, - ) - - # Verify that *exactly* the right properties are recorded. 
- assert expected_output == event_data - - -@pytest.mark.parametrize("allowed_schemas,expected_output", NESTED_CATEGORY_TEST_CASES) -def test_category_filtering_allof(allowed_schemas, expected_output): - event_data = get_event_data( - NESTED_EVENT_DATA, - NESTED_CATEGORY_SCHEMA_ALLOF, - SCHEMA_ID, - VERSION, - allowed_schemas, - ) - - # Verify that *exactly* the right properties are recorded. - assert expected_output == event_data diff --git a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000..0e77fd3 --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,52 @@ +import logging + +import pytest +from traitlets import TraitError +from traitlets.config.loader import PyFileConfigLoader + +from jupyter_events.logger import EventLogger + +GOOD_CONFIG = """ +import logging + +c.EventLogger.handlers = [ + logging.StreamHandler() +] +""" + +BAD_CONFIG = """ +import logging + +c.EventLogger.handlers = [ + 0 +] +""" + + +def get_config_from_file(path, content): + # Write config file + filename = "config.py" + config_file = path / filename + config_file.write_text(content) + + # Load written file. 
+ loader = PyFileConfigLoader(filename, path=str(path)) + cfg = loader.load_config() + return cfg + + +def test_good_config_file(tmp_path): + cfg = get_config_from_file(tmp_path, GOOD_CONFIG) + + # Pass config to EventLogger + e = EventLogger(config=cfg) + + assert len(e.handlers) > 0 + assert isinstance(e.handlers[0], logging.Handler) + + +def test_bad_config_file(tmp_path): + cfg = get_config_from_file(tmp_path, BAD_CONFIG) + + with pytest.raises(TraitError): + EventLogger(config=cfg) diff --git a/tests/test_redaction.py b/tests/test_redaction.py new file mode 100644 index 0000000..3210c86 --- /dev/null +++ b/tests/test_redaction.py @@ -0,0 +1,66 @@ +import pathlib + +import pytest + +from jupyter_events.schema import EventSchema + +SCHEMA_PATH = pathlib.Path(__file__).parent / "schemas" + + +@pytest.mark.parametrize( + "schema_file,redacted_policies,data,data_out", + [ + [ + # Redact fields from objects in a nested array. + "array.yaml", + ["user-identifier", "user-identifiable-information"], + { + "nothing-exciting": "hello, world", + "users": [ + {"id": "test id 0", "email": "test0@testemail.com"}, + {"id": "test id 1", "email": "test1@testemail.com"}, + ], + }, + { + "nothing-exciting": "hello, world", + "users": [{}, {}], + }, + ], + [ + "nested-array.yaml", + ["user-identifier", "user-identifiable-information"], + { + "nothing-exciting": "hello, world", + "users": [ + { + "name": "Alice", + "hobbies": [ + {"sport": "basketball", "position": "guard"}, + {"sport": "soccer", "position": "striker"}, + ], + }, + { + "name": "Bob", + "hobbies": [ + {"sport": "basketball", "position": "center"}, + {"sport": "soccer", "position": "goalie"}, + ], + }, + ], + }, + { + "nothing-exciting": "hello, world", + "users": [ + {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, + {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, + ], + }, + ], + ], +) +def test_redaction_in_arrays(schema_file, redacted_policies, data, data_out): + schema = 
EventSchema.from_file( + SCHEMA_PATH / "good" / schema_file, redacted_policies=redacted_policies + ) + schema.enforce_redaction_policies(data) + assert data == data_out diff --git a/tests/test_schema.py b/tests/test_schema.py new file mode 100644 index 0000000..b22a743 --- /dev/null +++ b/tests/test_schema.py @@ -0,0 +1,51 @@ +import pytest +from jsonschema.exceptions import ValidationError + +from jupyter_events.validators import JUPYTER_EVENTS_VALIDATOR +from jupyter_events.yaml import yaml + +from .utils import SCHEMA_PATH + +MISSING_REDACTION_POLICY = "'redactionPolicies' is a required property" + +BAD_SCHEMAS = [ + [ + # Bad schema file. + "missing-parent-policies.yaml", + # The expected valdation error message. + MISSING_REDACTION_POLICY, + ], + ["missing-policy-array.yaml", MISSING_REDACTION_POLICY], + ["missing-policy-nested-array.yaml", MISSING_REDACTION_POLICY], +] + + +@pytest.mark.parametrize("schema_file,validation_error_msg", BAD_SCHEMAS) +def test_bad_validations(schema_file, validation_error_msg): + """ + Validation fails because the schema is missing + a redactionPolicies field. + """ + # Read the schema file + with open(SCHEMA_PATH / "bad" / schema_file) as f: + schema = yaml.load(f) + # Assert that the schema files for a known reason. + with pytest.raises(ValidationError) as err: + JUPYTER_EVENTS_VALIDATOR.validate(schema) + assert validation_error_msg in err.value.message + + +GOOD_SCHEMAS = ["array.yaml", "nested-array.yaml", "basic.yaml"] + + +@pytest.mark.parametrize("schema_file", GOOD_SCHEMAS) +def test_good_validations(schema_file): + """ + Validation fails because the schema is missing + a redactionPolicies field. + """ + # Read the schema file + with open(SCHEMA_PATH / "good" / schema_file) as f: + schema = yaml.load(f) + # Assert that the schema files for a known reason. 
+ JUPYTER_EVENTS_VALIDATOR.validate(schema) diff --git a/tests/test_register_schema.py b/tests/test_schema_registry.py similarity index 82% rename from tests/test_register_schema.py rename to tests/test_schema_registry.py index 3a00de4..cc7fc62 100644 --- a/tests/test_register_schema.py +++ b/tests/test_schema_registry.py @@ -9,6 +9,7 @@ from ruamel.yaml import YAML from jupyter_events.logger import EventLogger +from jupyter_events.schema_registry import SchemaRegistryException def test_register_invalid_schema(): @@ -44,34 +45,29 @@ def test_missing_required_properties(): ) -def test_reserved_properties(): - """ - User schemas can't have properties starting with __ - - These are reserved - """ - el = EventLogger() - with pytest.raises(ValidationError): - el.register_schema( - { - "$id": "test/test", - "title": "Test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "properties": { - "__fail__": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - ) - - -@pytest.fixture(autouse=True) -def resetter(): - pass +# def test_reserved_properties(): +# """ +# User schemas can't have properties starting with __ + +# These are reserved +# """ +# el = EventLogger() +# # with pytest.raises(ValidationError): +# el.register_schema( +# { +# "$id": "test/test", +# "title": "Test", +# "version": 1, +# "redactionPolicies": ["unrestricted"], +# "properties": { +# "__fail__": { +# "type": "string", +# "title": "test", +# "redactionPolicies": ["unrestricted"], +# }, +# }, +# } +# ) def test_timestamp_override(): @@ -172,7 +168,7 @@ def test_register_schema_file(tmp_path): schema_file = tmp_path.joinpath("schema.yml") yaml.dump(schema, schema_file) el.register_schema_file(schema_file) - assert ("test/test3", 1) in el.schema_registry + assert ("test/test", 1) in el.schema_registry def test_register_schema_file_object(tmp_path): @@ -203,42 +199,7 @@ def test_register_schema_file_object(tmp_path): with open(str(schema_file)) as f: 
el.register_schema_file(f) - assert schema in el.schemas.values() - - -def test_allowed_schemas(): - """ - Events should be emitted only if their schemas are allowed - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - output = io.StringIO() - handler = logging.StreamHandler(output) - el = EventLogger(handlers=[handler]) - # Just register schema, but do not mark it as allowed - el.register_schema(schema) - - el.emit( - "test/test", - 1, - { - "something": "blah", - }, - ) - handler.flush() - - assert output.getvalue() == "" + assert ("test/test", 1) in el.schema_registry def test_emit_badschema(): @@ -258,6 +219,7 @@ def test_emit_badschema(): }, "status": { "enum": ["success", "failure"], + "title": "test 2", "redactionPolicies": ["unrestricted"], }, }, @@ -357,7 +319,7 @@ def test_unique_logger_instances(): def test_register_duplicate_schemas(): schema0 = { - "$id": "test/test0", + "$id": "test/test", "version": 1, "redactionPolicies": ["unrestricted"], "type": "object", @@ -371,7 +333,7 @@ def test_register_duplicate_schemas(): } schema1 = { - "$id": "test/test1", + "$id": "test/test", "version": 1, "redactionPolicies": ["unrestricted"], "type": "object", @@ -386,5 +348,5 @@ def test_register_duplicate_schemas(): el = EventLogger() el.register_schema(schema0) - with pytest.raises(ValueError): + with pytest.raises(SchemaRegistryException): el.register_schema(schema1) diff --git a/tests/utils.py b/tests/utils.py index 20bac8c..2286364 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,18 +1,21 @@ import io import json import logging +import pathlib from copy import deepcopy from jupyter_events.logger import EventLogger +SCHEMA_PATH = pathlib.Path(__file__).parent / "schemas" -def get_event_data(event, schema, schema_id, version, allowed_policies): + +def 
get_event_data(event, schema, schema_id, version, unredacted_policies): sink = io.StringIO() # Create a handler that captures+records events with allowed tags. handler = logging.StreamHandler(sink) - e = EventLogger(handlers=[handler], allowed_policies=allowed_policies) + e = EventLogger(handlers=[handler], unredacted_policies=unredacted_policies) e.register_schema(schema) # Record event and read output From c42cbcd5d8166b4d02b0fa0c8784b2e2d232adc3 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Tue, 19 Jul 2022 16:20:44 -0700 Subject: [PATCH 04/14] add myst docs --- docs/pages/application.rst | 7 +- docs/pages/redaction_policies.md | 5 ++ docs/pages/schemas.md | 58 +++++++++++++++++ docs/pages/schemas.rst | 64 ------------------- .../schemas/property-metaschema.yml | 1 + tests/test_redaction.py | 9 ++- 6 files changed, 75 insertions(+), 69 deletions(-) create mode 100644 docs/pages/redaction_policies.md create mode 100644 docs/pages/schemas.md delete mode 100644 docs/pages/schemas.rst diff --git a/docs/pages/application.rst b/docs/pages/application.rst index 963e460..98800d2 100644 --- a/docs/pages/application.rst +++ b/docs/pages/application.rst @@ -30,8 +30,7 @@ EventLogger has two configurable traits: - ``handlers``: a list of Python's logging handlers that handle the recording of incoming events. - - ``allowed_schemas``: a dictionary of options for each schema - describing what data should be collected. + - ``redacted_policies``: a list of `redactionPolicies` that will be removed from all emitted events. Next, you'll need to register event schemas for your application. You can register schemas using the ``register_schema_file`` @@ -39,13 +38,13 @@ You can register schemas using the ``register_schema_file`` Once your have an instance of ``EventLogger`` and your registered -schemas, you can use the ``record_event`` method to log that event. +schemas, you can use the ``emit`` method to log that event. .. code-block:: python # Record an example event. 
event = {'name': 'example event'} - self.eventlogger.record_event( + self.eventlogger.emit( schema_id='url.to.event.schema', version=1, event=event diff --git a/docs/pages/redaction_policies.md b/docs/pages/redaction_policies.md new file mode 100644 index 0000000..2bca8fe --- /dev/null +++ b/docs/pages/redaction_policies.md @@ -0,0 +1,5 @@ +# Redacting Sensitive Data + +Jupyter Events might include sensitive data, specifically personally identifiable information (PII). To reduce +the risk of capturing unwanted PII, Jupyter Events requires _every_ registered event to explicitly list its +`redactionPolicies`. Data labeled with a redacted policy will be removed from an event by Jupyter Events **before** being emitted. Schemas that list properties without an explicit `redactionPolicies` list will fail validation. diff --git a/docs/pages/schemas.md b/docs/pages/schemas.md new file mode 100644 index 0000000..dd30f24 --- /dev/null +++ b/docs/pages/schemas.md @@ -0,0 +1,58 @@ +# Writing a schema for Jupyter Events + +Jupyter Event Schemas must be valid [JSON schema](https://json-schema.org/) and can be written in valid +YAML or JSON. Every schema is validated against Jupyter Event's "meta"-JSON schema, [here](). + +At a minimum, a valid Jupyter Event schema must have the following keys: + +- `$id` : a URI to identify (and possibly locate) the schema. +- `version` : the schema version. +- `redactionPolicies`: a list of labels representing the personal data sensitivity of this event. The main logger can be configured to redact any events or event properties that might contain sensitive information. Set this value to `"unrestricted"` if emitting that this event happened does not reveal any personal data. +- `properties` : attributes of the event being emitted. + + Each property should have the following attributes: + + - `title` : name of the property + - `redactionPolicies`: a list of labels representing the personal data sensitivity of this property.
This field will be redacted from the emitted event if the policy is not allowed. + +- `required`: list of required properties. + +Here is a minimal example of a valid JSON schema for an event. + +```yaml +$id: event.jupyter.org/example-event +version: 1 +title: My Event +description: | + All events must have a name property +type: object +redactionPolicies: + - category.jupyter.org/unrestricted +properties: + thing: + title: Thing + redactionPolicies: + - category.jupyter.org/unrestricted + description: A random thing. + user: + title: User name + redactionPolicies: + - category.jupyter.org/user-identifier + description: Name of user who initiated event +required: + - thing + - user +``` + +## Redaction Policies + +Each property can be labelled with a `redactionPolicies` field. This makes it easier to +filter properties based on a category. We recommend that schema authors use valid +URIs for these labels, e.g. something like `category.jupyter.org/unrestricted`. + +Below is a list of common category labels that Jupyter Events recommends using: + +- `category.jupyter.org/unrestricted` +- `category.jupyter.org/user-identifier` +- `category.jupyter.org/user-identifiable-information` +- `category.jupyter.org/action-timestamp` diff --git a/docs/pages/schemas.rst b/docs/pages/schemas.rst deleted file mode 100644 index 4fd2ade..0000000 --- a/docs/pages/schemas.rst +++ /dev/null @@ -1,64 +0,0 @@ -Writing a schema for Jupyter Events -=================================== - -All Schemas should be a valid `JSON schema`_ and can be written in valid -YAML or JSON. - -At a minimum, valid Jupyter Event schema requires have the following keys: - -- ``$id`` : a URI to identify (and possibly locate) the schema. -- ``version`` : schema version. -- ``title`` : name of the schema -- ``description`` : documentation for the schema -- ``properties`` : attributes of the event being emitted.
- - Each property should have the following attributes: - - + ``title`` : name of the property - + ``description``: documentation for this property. - + ``categories``: list of types of data being collected - -- ``required``: list of required properties. - -Here is a minimal example of a valid JSON schema for an event. - -.. code-block:: yaml - - $id: event.jupyter.org/example-event - version: 1 - title: My Event - description: | - All events must have a name property - type: object - properties: - thing: - title: Thing - categories: - - category.jupyter.org/unrestricted - description: A random thing. - user: - title: User name - categories: - - category.jupyter.org/user-identifier - description: Name of user who initiated event - required: - - thing - - user - - -.. _JSON schema: https://json-schema.org/ - - -Property Categories -------------------- - -Each property can be labelled with ``categories`` field. This makes it easier to -filter properties based on a category. We recommend that schema authors use valid -URIs for these labels, e.g. something like ``category.jupyter.org/unrestricted``. 
- -Below is a list of common category labels that Jupyter Events recommends using: - -* ``category.jupyter.org/unrestricted`` -* ``category.jupyter.org/user-identifier`` -* ``category.jupyter.org/user-identifiable-information`` -* ``category.jupyter.org/action-timestamp`` diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index f61ef35..d36cdb2 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -21,6 +21,7 @@ properties: items: $ref: http://event.jupyter.org/property-metaschema additionalProperties: true + required: - title - redactionPolicies diff --git a/tests/test_redaction.py b/tests/test_redaction.py index 3210c86..61ea086 100644 --- a/tests/test_redaction.py +++ b/tests/test_redaction.py @@ -11,9 +11,11 @@ "schema_file,redacted_policies,data,data_out", [ [ - # Redact fields from objects in a nested array. + # Schema name "array.yaml", + # Redacted policies ["user-identifier", "user-identifiable-information"], + # Unredacted data { "nothing-exciting": "hello, world", "users": [ @@ -21,14 +23,18 @@ {"id": "test id 1", "email": "test1@testemail.com"}, ], }, + # Redacted data { "nothing-exciting": "hello, world", "users": [{}, {}], }, ], [ + # Schema name "nested-array.yaml", + # Redacted policies ["user-identifier", "user-identifiable-information"], + # Unredacted data { "nothing-exciting": "hello, world", "users": [ @@ -48,6 +54,7 @@ }, ], }, + # Redacted data { "nothing-exciting": "hello, world", "users": [ From 433832d8013712410caa5714d63b756adae959e6 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Tue, 19 Jul 2022 16:29:54 -0700 Subject: [PATCH 05/14] use myst for documentation --- docs/conf.py | 7 +++++-- docs/index.rst | 1 + docs/pages/configure.md | 33 +++++++++++++++++++++++++++++++ docs/pages/configure.rst | 42 ---------------------------------------- docs/requirements.txt | 3 ++- 5 files changed, 41 insertions(+), 45 deletions(-) 
create mode 100644 docs/pages/configure.md delete mode 100644 docs/pages/configure.rst diff --git a/docs/conf.py b/docs/conf.py index ec3fbbb..a5860a5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,11 +29,14 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions: List = [] +extensions: List = ["myst_parser"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] +source_suffix = [".rst", ".md"] + + # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. @@ -45,7 +48,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" +html_theme = "pydata_sphinx_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/index.rst b/docs/index.rst index 5f48ca4..e839c95 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -42,6 +42,7 @@ Jupyter's Events library can be installed from PyPI. pages/configure pages/application pages/schemas + pages/redaction_policies Indices and tables ------------------ diff --git a/docs/pages/configure.md b/docs/pages/configure.md new file mode 100644 index 0000000..ae32a1b --- /dev/null +++ b/docs/pages/configure.md @@ -0,0 +1,33 @@ +.. \_using-events: + +# Using Jupyter Events in Jupyter applications + +Most people will use `jupyter_events` to log events data from Jupyter +applications, (e.g. JupyterLab, Jupyter Server, JupyterHub, etc). + +In this case, you'll be able to record events provided by schemas within +those applications. To start, you'll need to configure each +application's `EventLogger` object. + +This usually means two things: + +1. 
Define a set of `logging` handlers (from Python's standard library) + to tell Jupyter Events where to send your event data + (e.g. file, remote storage, etc.) +2. List redacted policies to remove sensitive data from any events. + +Here is an example of a Jupyter configuration file, e.g. `jupyter_config.d`, +that demonstrates how to configure an eventlog. + +```python +from logging import FileHandler + +# Log events to a local file on disk. +handler = FileHandler('events.txt') + +# Explicitly list the types of events +# to record and what properties or what categories +# of data to begin collecting. +c.EventLogger.handlers = [handler] +c.EventLogger.redacted_policies = ["user-identifiable-information", "user-identifier"] +``` diff --git a/docs/pages/configure.rst b/docs/pages/configure.rst deleted file mode 100644 index f1ef567..0000000 --- a/docs/pages/configure.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. _using-events: - -Using Jupyter Events in Jupyter applications -============================================ - -Most people will use ``jupyter_events`` to log events data from Jupyter -applications, (e.g. JupyterLab, Jupyter Server, JupyterHub, etc). - -In this case, you'll be able to record events provided by schemas within -those applications. To start, you'll need to configure each -application's ``EventLogger`` object. - -This usually means two things: - -1. Define a set of ``logging`` handlers (from Python's standard library) -to tell Jupyter Events where to send your event data -(e.g. file, remote storage, etc.) -2. List the names of events to collect and the properties/categories -to collect from each of those events. (see the example below for more details). - -Here is an example of a Jupyter configuration file, e.g. ``jupyter_config.d``, -that demonstrates how to configure an eventlog. - -.. code-block:: python - - from logging import FileHandler - - # Log events to a local file on disk. 
- handler = FileHandler('events.txt') - - # Explicitly list the types of events - # to record and what properties or what categories - # of data to begin collecting. - allowed_schemas = { - "uri.to.schema": { - "allowed_properties": ["name", "email"], - "allowed_categories": ["category.jupyter.org/user-identifier"] - } - } - - c.EventLogger.handlers = [handler] - c.EventLogger.allowed_schemas = allowed_schemas diff --git a/docs/requirements.txt b/docs/requirements.txt index 483a4e9..b9c17d5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,2 @@ -sphinx_rtd_theme +myst_parser +pydata_sphinx_theme From 4b8396339a84a220b04824516d759b29c455f91e Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Wed, 20 Jul 2022 12:30:51 -0700 Subject: [PATCH 06/14] working tests --- .pre-commit-config.yaml | 2 +- docs/conf.py | 5 +- docs/index.rst | 1 + docs/pages/demo-notebook.ipynb | 32 +++ docs/pages/demo.md | 8 + jupyter_events/logger.py | 31 +-- jupyter_events/schema.py | 7 +- jupyter_events/schema_registry.py | 55 +++-- jupyter_events/validators.py | 2 +- jupyter_events/yaml.py | 66 ++++-- pyproject.toml | 4 +- tests/test_logger.py | 341 +++++++++++++++++++++++++++++ tests/test_schema.py | 6 +- tests/test_schema_registry.py | 352 ------------------------------ 14 files changed, 503 insertions(+), 409 deletions(-) create mode 100644 docs/pages/demo-notebook.ipynb create mode 100644 docs/pages/demo.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0e8ea07..89cb4c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: hooks: - id: mypy exclude: examples/simple/setup.py - additional_dependencies: [types-requests] + additional_dependencies: [types-requests, types-PyYAML] - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.6.2 diff --git a/docs/conf.py b/docs/conf.py index a5860a5..66af829 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ # Add any Sphinx extension module names here, 
as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions: List = ["myst_parser"] +extensions: List = ["myst_parser", "jupyterlite_sphinx"] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -55,3 +55,6 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] master_doc = "index" + +# Configure jupyterlite to import jupyter_events package +jupyterlite_contents = ["pages/demo-notebook.ipynb"] diff --git a/docs/index.rst b/docs/index.rst index e839c95..a90eb6e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,6 +43,7 @@ Jupyter's Events library can be installed from PyPI. pages/application pages/schemas pages/redaction_policies + pages/demo Indices and tables ------------------ diff --git a/docs/pages/demo-notebook.ipynb b/docs/pages/demo-notebook.ipynb new file mode 100644 index 0000000..0c68fab --- /dev/null +++ b/docs/pages/demo-notebook.ipynb @@ -0,0 +1,32 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import piplite\n", + "\n", + "piplite.install(\"jupyter_events\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from jupyter_events.logger import EventLogger\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/pages/demo.md b/docs/pages/demo.md new file mode 100644 index 0000000..fb12e70 --- /dev/null +++ b/docs/pages/demo.md @@ -0,0 +1,8 @@ +# Jupyter Events Demo + +```{retrolite} demo-notebook.ipynb +--- +width: 100% +height: 600px +--- +``` diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 8f425c9..60ef881 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -39,10 +39,10 @@ class 
EventLogger(Configurable): ), ) - schema_registry = Instance(SchemaRegistry) + schemas = Instance(SchemaRegistry) - @default("schema_registry") - def _default_schema_registry(self): + @default("schemas") + def _default_schemas(self): return SchemaRegistry(redacted_policies=self.redacted_policies) def __init__(self, *args, **kwargs): @@ -79,13 +79,19 @@ def get_handlers(): eventlogger_cfg = Config({"EventLogger": my_cfg}) super()._load_config(eventlogger_cfg, section_names=None, traits=None) - def register_schema(self, schema: dict): - """Register events schema with the SchemaRegistry.""" - self.schema_registry.register(schema) + def register_schema(self, schema): + """Register this schema with the schema registry. - def register_schema_file(self, schema_filepath): - """Register events schema with the SchemaRegistry.""" - self.schema_registry.register_from_file(schema_filepath) + Get this registered schema using the EventLogger.schema.get() method. + """ + self.schemas.register(schema) + + def register_schema_file(self, schema_file): + """Register this schema with the schema registry. + + Get this registered schema using the EventLogger.schema.get() method. + """ + self.schemas.register_from_file(schema_file) def add_handler(self, handler: logging.Handler): """Add a new logging handler to the Event Logger. 
@@ -134,7 +140,7 @@ def emit(self, schema_name, version, event, timestamp_override=None): dict The recorded event data """ - if not self.handlers or (schema_name, version) not in self.schema_registry: + if not self.handlers or (schema_name, version) not in self.schemas: # if handler isn't set up or schema is not explicitly whitelisted, # don't do anything return @@ -150,9 +156,8 @@ def emit(self, schema_name, version, event, timestamp_override=None): "__schema_version__": version, "__metadata_version__": EVENTS_METADATA_VERSION, } - schema = self.schema_registry.get((schema_name, version)) - schema.validate(event) - schema.enforce_redaction_policies(event) + # Process this event, i.e. validate and redact (in place) + self.schemas.process_event(schema_name, version, event) capsule.update(event) self.log.info(capsule) return capsule diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 08272fb..67ea69d 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -2,8 +2,8 @@ from jsonschema import validators +from . import yaml from .validators import JUPYTER_EVENTS_VALIDATOR -from .yaml import yaml def _pop_nested_redacted_fields( @@ -180,11 +180,12 @@ def enforce_redaction_policies(self, data: dict) -> None: # redacted_policies = named_policies - self.unredacted_policies for policy_type in self.redacted_policies: policy_locations = self._redaction_policies_locations[policy_type] - print(policy_type, policy_locations) for item in policy_locations: _pop_nested_redacted_fields(data, item) def process(self, data: dict) -> None: - """Validate event data and enforce an redaction policies.""" + """Validate event data and enforce any redaction policies (in place). + Nothing is returned by this method, because the data is redacted in place.
+ """ self.validate(data) self.enforce_redaction_policies(data) diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index 0f070ea..95fca6b 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -1,4 +1,4 @@ -from multiprocessing import Event +from typing import Any, List, Optional, Tuple from .schema import EventSchema @@ -8,15 +8,20 @@ class SchemaRegistryException(Exception): class SchemaRegistry: - def __init__(self, schemas=None, redacted_policies=None): + """A convenient API for storing and searching a group of schemas.""" + + def __init__(self, schemas: dict = None, redacted_policies: list = None): self._schemas = schemas or {} self._redacted_policies = redacted_policies @property - def redacted_policies(self): + def redacted_policies(self) -> Optional[List[Any]]: + """A list of policies that will be redacted from + all events validated against this registry. + """ return self._redacted_policies - def __contains__(self, registry_key): + def __contains__(self, registry_key: Tuple): """Syntax sugar to check if a schema is found in the registry""" return registry_key in self._schemas @@ -30,7 +35,11 @@ def _add(self, schema_obj: EventSchema): self._schemas[schema_obj.registry_key] = schema_obj def register(self, schema_data): - """Register a schema.""" + """Add a valid schema to the registry. + + All schemas are validated against the Jupyter Events meta-schema + found here: + """ schema = EventSchema(schema_data, redacted_policies=self.redacted_policies) self._add(schema) @@ -41,22 +50,42 @@ def register_from_file(self, schema_filepath): ) self._add(schema) - def get(self, registry_key) -> EventSchema: + def get(self, id: str, version: int) -> EventSchema: + """Fetch a given schema. If the schema is not found, + this will raise a KeyError. 
+ """ try: - return self._schemas[registry_key] + return self._schemas[(id, version)] except KeyError: raise KeyError( - f"The requested schema, {registry_key[0]} " - f"(version {registry_key[1]}), was not found in the " + f"The requested schema, {id} " + f"(version {version}), was not found in the " "schema registry. Are you sure it was previously registered?" ) - def remove(self, registry_key): + def remove(self, id: str, version: int) -> None: + """Remove a given schema. If the schema is not found, + this will raise a KeyError. + """ try: - del self._schemas[registry_key] + del self._schemas[(id, version)] except KeyError: raise KeyError( - f"The requested schema, {registry_key[0]} " - f"(version {registry_key[1]}), was not found in the " + f"The requested schema, {id} " + f"(version {version}), was not found in the " "schema registry. Are you sure it was previously registered?" ) + + def validate_event(self, id: str, version: int, data: dict) -> None: + """Validate an event against a schema within this + registry. + """ + schema = self.get(id, version) + schema.validate(data) + + def process_event(self, id: str, version: int, data: dict) -> None: + """Validate an event and enforce any redaction policies (in place). + Nothing is returned by this method, because the data is redacted in place. + """ + schema = self.get(id, version) + schema.process(data) diff --git a/jupyter_events/validators.py b/jupyter_events/validators.py index 60443d5..92a86e2 100644 --- a/jupyter_events/validators.py +++ b/jupyter_events/validators.py @@ -2,7 +2,7 @@ from jsonschema import RefResolver, validators -from .yaml import yaml +from .
import yaml METASCHEMA_PATH = pathlib.Path(__file__).parent.joinpath("schemas") EVENT_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("event-metaschema.yml") diff --git a/jupyter_events/yaml.py b/jupyter_events/yaml.py index 4f839cc..526a72a 100644 --- a/jupyter_events/yaml.py +++ b/jupyter_events/yaml.py @@ -1,21 +1,47 @@ +# mypy: ignore-errors + +# try: +# from ruamel.yaml import YAML +# except ImportError as e: +# # check for known conda bug that prevents +# # pip from installing ruamel.yaml dependency +# try: +# import ruamel_yaml # noqa +# except ImportError: +# # nope, regular import error; raise original +# raise e +# else: +# # have conda fork ruamel_yaml, but not ruamel.yaml. +# # this is a bug in the ruamel_yaml conda package +# # mistakenly identifying itself as ruamel.yaml to pip. +# # conda install the 'real' ruamel.yaml to fix +# raise ImportError( +# "Missing dependency ruamel.yaml. Try: `conda install ruamel.yaml`" +# ) +import pathlib + +from yaml import dump as ydump +from yaml import load as yload + try: - from ruamel.yaml import YAML -except ImportError as e: - # check for known conda bug that prevents - # pip from installing ruamel.yaml dependency - try: - import ruamel_yaml # noqa - except ImportError: - # nope, regular import error; raise original - raise e - else: - # have conda fork ruamel_yaml, but not ruamel.yaml. - # this is a bug in the ruamel_yaml conda package - # mistakenly identifying itself as ruamel.yaml to pip. - # conda install the 'real' ruamel.yaml to fix - raise ImportError( - "Missing dependency ruamel.yaml. 
Try: `conda install ruamel.yaml`" - ) - - -yaml = YAML(typ="safe") + from yaml import CDumper as Dumper + from yaml import CLoader as Loader +except ImportError: + from yaml import Dumper, Loader + + +def loads(stream): + return yload(stream, Loader=Loader) + + +def dumps(stream): + return ydump(stream, Dumper=Dumper) + + +def load(fpath): + data = pathlib.Path(str(fpath)).read_text() + return loads(data) + + +def dump(data, outpath): + pathlib.Path(outpath).write_text(dumps(data)) diff --git a/pyproject.toml b/pyproject.toml index 4eeb5dd..3eccb1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,14 +26,14 @@ classifiers = [ dependencies = [ "jsonschema", "python-json-logger", - "ruamel.yaml", + "pyyaml", "traitlets", ] dynamic = [ "version", ] -[project.readme] +[projects.readme] file = "README.md" content-type = "text/markdown" diff --git a/tests/test_logger.py b/tests/test_logger.py index 0e77fd3..ad8e79b 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,10 +1,17 @@ +import io +import json import logging +from datetime import datetime, timedelta +import jsonschema import pytest +from jsonschema.exceptions import ValidationError from traitlets import TraitError from traitlets.config.loader import PyFileConfigLoader +from jupyter_events import yaml from jupyter_events.logger import EventLogger +from jupyter_events.schema_registry import SchemaRegistryException GOOD_CONFIG = """ import logging @@ -50,3 +57,337 @@ def test_bad_config_file(tmp_path): with pytest.raises(TraitError): EventLogger(config=cfg) + + +def test_register_invalid_schema(): + """ + Invalid JSON Schemas should fail registration + """ + el = EventLogger() + with pytest.raises(ValidationError): + el.register_schema( + { + # Totally invalid + "properties": True + } + ) + + +def test_missing_required_properties(): + """ + id and $version are required properties in our schemas. 
+ + They aren't required by JSON Schema itself + """ + el = EventLogger() + with pytest.raises(ValidationError): + el.register_schema({"properties": {}}) + + with pytest.raises(ValidationError): + el.register_schema( + { + "$id": "something", + "$version": 1, # This should been 'version' + } + ) + + +# def test_reserved_properties(): +# """ +# User schemas can't have properties starting with __ + +# These are reserved +# """ +# el = EventLogger() +# # with pytest.raises(ValidationError): +# el.register_schema( +# { +# "$id": "test/test", +# "title": "Test", +# "version": 1, +# "redactionPolicies": ["unrestricted"], +# "properties": { +# "__fail__": { +# "type": "string", +# "title": "test", +# "redactionPolicies": ["unrestricted"], +# }, +# }, +# } +# ) + + +def test_timestamp_override(): + """ + Simple test for overriding timestamp + """ + schema = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + output = io.StringIO() + handler = logging.StreamHandler(output) + el = EventLogger(handlers=[handler]) + el.register_schema(schema) + + timestamp_override = datetime.utcnow() - timedelta(days=1) + el.emit( + "test/test", 1, {"something": "blah"}, timestamp_override=timestamp_override + ) + handler.flush() + event_capsule = json.loads(output.getvalue()) + assert event_capsule["__timestamp__"] == timestamp_override.isoformat() + "Z" + + +def test_emit(): + """ + Simple test for emitting valid events + """ + schema = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + output = io.StringIO() + handler = logging.StreamHandler(output) + el = EventLogger(handlers=[handler]) + el.register_schema(schema) + + el.emit( + "test/test", + 1, + { + "something": 
"blah", + }, + ) + handler.flush() + + event_capsule = json.loads(output.getvalue()) + + assert "__timestamp__" in event_capsule + # Remove timestamp from capsule when checking equality, since it is gonna vary + del event_capsule["__timestamp__"] + assert event_capsule == { + "__schema__": "test/test", + "__schema_version__": 1, + "__metadata_version__": 1, + "something": "blah", + } + + +def test_register_schema_file(tmp_path): + """ + Register schema from a file + """ + schema = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + el = EventLogger() + schema_file = tmp_path.joinpath("schema.yml") + yaml.dump(schema, schema_file) + el.register_schema_file(schema_file) + assert ("test/test", 1) in el.schemas + + +def test_register_schema_file_object(tmp_path): + """ + Register schema from a file + """ + schema = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + el = EventLogger() + schema_file = tmp_path.joinpath("schema.yml") + yaml.dump(schema, schema_file) + el.register_schema_file(schema_file) + + assert ("test/test", 1) in el.schemas + + +def test_emit_badschema(): + """ + Fail fast when an event doesn't conform to its schema + """ + schema = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + "status": { + "enum": ["success", "failure"], + "title": "test 2", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + el = EventLogger(handlers=[logging.NullHandler()]) + el.register_schema(schema) + el.allowed_schemas = ["test/test"] + 
+ with pytest.raises(jsonschema.ValidationError): + el.emit("test/test", 1, {"something": "blah", "status": "hi"}) # 'not-in-enum' + + +def test_unique_logger_instances(): + schema0 = { + "$id": "test/test0", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + schema1 = { + "$id": "test/test1", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + output0 = io.StringIO() + output1 = io.StringIO() + handler0 = logging.StreamHandler(output0) + handler1 = logging.StreamHandler(output1) + + el0 = EventLogger(handlers=[handler0]) + el0.register_schema(schema0) + el0.allowed_schemas = ["test/test0"] + + el1 = EventLogger(handlers=[handler1]) + el1.register_schema(schema1) + el1.allowed_schemas = ["test/test1"] + + el0.emit( + "test/test0", + 1, + { + "something": "blah", + }, + ) + el1.emit( + "test/test1", + 1, + { + "something": "blah", + }, + ) + handler0.flush() + handler1.flush() + + event_capsule0 = json.loads(output0.getvalue()) + + assert "__timestamp__" in event_capsule0 + # Remove timestamp from capsule when checking equality, since it is gonna vary + del event_capsule0["__timestamp__"] + assert event_capsule0 == { + "__schema__": "test/test0", + "__schema_version__": 1, + "__metadata_version__": 1, + "something": "blah", + } + + event_capsule1 = json.loads(output1.getvalue()) + + assert "__timestamp__" in event_capsule1 + # Remove timestamp from capsule when checking equality, since it is gonna vary + del event_capsule1["__timestamp__"] + assert event_capsule1 == { + "__schema__": "test/test1", + "__schema_version__": 1, + "__metadata_version__": 1, + "something": "blah", + } + + +def test_register_duplicate_schemas(): + schema0 = { + "$id": 
"test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + schema1 = { + "$id": "test/test", + "version": 1, + "redactionPolicies": ["unrestricted"], + "type": "object", + "properties": { + "something": { + "type": "string", + "title": "test", + "redactionPolicies": ["unrestricted"], + }, + }, + } + + el = EventLogger() + el.register_schema(schema0) + with pytest.raises(SchemaRegistryException): + el.register_schema(schema1) diff --git a/tests/test_schema.py b/tests/test_schema.py index b22a743..16ad714 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1,8 +1,8 @@ import pytest from jsonschema.exceptions import ValidationError +from jupyter_events import yaml from jupyter_events.validators import JUPYTER_EVENTS_VALIDATOR -from jupyter_events.yaml import yaml from .utils import SCHEMA_PATH @@ -28,7 +28,7 @@ def test_bad_validations(schema_file, validation_error_msg): """ # Read the schema file with open(SCHEMA_PATH / "bad" / schema_file) as f: - schema = yaml.load(f) + schema = yaml.loads(f) # Assert that the schema files for a known reason. with pytest.raises(ValidationError) as err: JUPYTER_EVENTS_VALIDATOR.validate(schema) @@ -46,6 +46,6 @@ def test_good_validations(schema_file): """ # Read the schema file with open(SCHEMA_PATH / "good" / schema_file) as f: - schema = yaml.load(f) + schema = yaml.loads(f) # Assert that the schema files for a known reason. 
JUPYTER_EVENTS_VALIDATOR.validate(schema) diff --git a/tests/test_schema_registry.py b/tests/test_schema_registry.py index cc7fc62..e69de29 100644 --- a/tests/test_schema_registry.py +++ b/tests/test_schema_registry.py @@ -1,352 +0,0 @@ -import io -import json -import logging -from datetime import datetime, timedelta - -import jsonschema -import pytest -from jsonschema.exceptions import ValidationError -from ruamel.yaml import YAML - -from jupyter_events.logger import EventLogger -from jupyter_events.schema_registry import SchemaRegistryException - - -def test_register_invalid_schema(): - """ - Invalid JSON Schemas should fail registration - """ - el = EventLogger() - with pytest.raises(ValidationError): - el.register_schema( - { - # Totally invalid - "properties": True - } - ) - - -def test_missing_required_properties(): - """ - id and $version are required properties in our schemas. - - They aren't required by JSON Schema itself - """ - el = EventLogger() - with pytest.raises(ValidationError): - el.register_schema({"properties": {}}) - - with pytest.raises(ValidationError): - el.register_schema( - { - "$id": "something", - "$version": 1, # This should been 'version' - } - ) - - -# def test_reserved_properties(): -# """ -# User schemas can't have properties starting with __ - -# These are reserved -# """ -# el = EventLogger() -# # with pytest.raises(ValidationError): -# el.register_schema( -# { -# "$id": "test/test", -# "title": "Test", -# "version": 1, -# "redactionPolicies": ["unrestricted"], -# "properties": { -# "__fail__": { -# "type": "string", -# "title": "test", -# "redactionPolicies": ["unrestricted"], -# }, -# }, -# } -# ) - - -def test_timestamp_override(): - """ - Simple test for overriding timestamp - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - output = io.StringIO() - 
handler = logging.StreamHandler(output) - el = EventLogger(handlers=[handler]) - el.register_schema(schema) - - timestamp_override = datetime.utcnow() - timedelta(days=1) - el.emit( - "test/test", 1, {"something": "blah"}, timestamp_override=timestamp_override - ) - handler.flush() - event_capsule = json.loads(output.getvalue()) - assert event_capsule["__timestamp__"] == timestamp_override.isoformat() + "Z" - - -def test_emit(): - """ - Simple test for emitting valid events - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - output = io.StringIO() - handler = logging.StreamHandler(output) - el = EventLogger(handlers=[handler]) - el.register_schema(schema) - - el.emit( - "test/test", - 1, - { - "something": "blah", - }, - ) - handler.flush() - - event_capsule = json.loads(output.getvalue()) - - assert "__timestamp__" in event_capsule - # Remove timestamp from capsule when checking equality, since it is gonna vary - del event_capsule["__timestamp__"] - assert event_capsule == { - "__schema__": "test/test", - "__schema_version__": 1, - "__metadata_version__": 1, - "something": "blah", - } - - -def test_register_schema_file(tmp_path): - """ - Register schema from a file - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - el = EventLogger() - yaml = YAML(typ="safe") - schema_file = tmp_path.joinpath("schema.yml") - yaml.dump(schema, schema_file) - el.register_schema_file(schema_file) - assert ("test/test", 1) in el.schema_registry - - -def test_register_schema_file_object(tmp_path): - """ - Register schema from a file - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": 
["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - el = EventLogger() - - yaml = YAML(typ="safe") - - schema_file = tmp_path.joinpath("schema.yml") - yaml.dump(schema, schema_file) - - with open(str(schema_file)) as f: - el.register_schema_file(f) - - assert ("test/test", 1) in el.schema_registry - - -def test_emit_badschema(): - """ - Fail fast when an event doesn't conform to its schema - """ - schema = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - "status": { - "enum": ["success", "failure"], - "title": "test 2", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - el = EventLogger(handlers=[logging.NullHandler()]) - el.register_schema(schema) - el.allowed_schemas = ["test/test"] - - with pytest.raises(jsonschema.ValidationError): - el.emit("test/test", 1, {"something": "blah", "status": "hi"}) # 'not-in-enum' - - -def test_unique_logger_instances(): - schema0 = { - "$id": "test/test0", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - schema1 = { - "$id": "test/test1", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - output0 = io.StringIO() - output1 = io.StringIO() - handler0 = logging.StreamHandler(output0) - handler1 = logging.StreamHandler(output1) - - el0 = EventLogger(handlers=[handler0]) - el0.register_schema(schema0) - el0.allowed_schemas = ["test/test0"] - - el1 = EventLogger(handlers=[handler1]) - el1.register_schema(schema1) - 
el1.allowed_schemas = ["test/test1"] - - el0.emit( - "test/test0", - 1, - { - "something": "blah", - }, - ) - el1.emit( - "test/test1", - 1, - { - "something": "blah", - }, - ) - handler0.flush() - handler1.flush() - - event_capsule0 = json.loads(output0.getvalue()) - - assert "__timestamp__" in event_capsule0 - # Remove timestamp from capsule when checking equality, since it is gonna vary - del event_capsule0["__timestamp__"] - assert event_capsule0 == { - "__schema__": "test/test0", - "__schema_version__": 1, - "__metadata_version__": 1, - "something": "blah", - } - - event_capsule1 = json.loads(output1.getvalue()) - - assert "__timestamp__" in event_capsule1 - # Remove timestamp from capsule when checking equality, since it is gonna vary - del event_capsule1["__timestamp__"] - assert event_capsule1 == { - "__schema__": "test/test1", - "__schema_version__": 1, - "__metadata_version__": 1, - "something": "blah", - } - - -def test_register_duplicate_schemas(): - schema0 = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - schema1 = { - "$id": "test/test", - "version": 1, - "redactionPolicies": ["unrestricted"], - "type": "object", - "properties": { - "something": { - "type": "string", - "title": "test", - "redactionPolicies": ["unrestricted"], - }, - }, - } - - el = EventLogger() - el.register_schema(schema0) - with pytest.raises(SchemaRegistryException): - el.register_schema(schema1) From 7f17a631ebef19a30fbb804c84d429fa3b984ed5 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 07:50:25 -0700 Subject: [PATCH 07/14] unit test working with latest jsonschema --- .github/workflows/python-tests.yml | 1 + jupyter_events/schemas/event-metaschema.yml | 2 -- jupyter_events/schemas/property-metaschema.yml | 1 - .../schemas/reserved-property-metaschema.yml | 4 ++-- 
jupyter_events/validators.py | 15 +++++++-------- tests/schemas/good/basic.yaml | 3 +-- tests/test_schema.py | 1 + 7 files changed, 12 insertions(+), 15 deletions(-) diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index 3f692cd..77c9d46 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -30,6 +30,7 @@ jobs: - name: Install the Python dependencies run: | pip install -e ".[test]" codecov + pip list - name: Run the tests if: ${{ !startsWith(matrix.python-version, 'pypy') && !startsWith(matrix.os, 'windows') }} run: | diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml index 2ce1b6e..14aa90e 100644 --- a/jupyter_events/schemas/event-metaschema.yml +++ b/jupyter_events/schemas/event-metaschema.yml @@ -7,8 +7,6 @@ description: | schemas are appropriately defined. type: object properties: - $id: - type: string version: type: integer title: diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index d36cdb2..6ff27cd 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -20,7 +20,6 @@ properties: $ref: http://event.jupyter.org/property-metaschema items: $ref: http://event.jupyter.org/property-metaschema - additionalProperties: true required: - title diff --git a/jupyter_events/schemas/reserved-property-metaschema.yml b/jupyter_events/schemas/reserved-property-metaschema.yml index 42411b7..204ed2f 100644 --- a/jupyter_events/schemas/reserved-property-metaschema.yml +++ b/jupyter_events/schemas/reserved-property-metaschema.yml @@ -6,7 +6,7 @@ description: | Property names that are reserved for Jupyter Events and should not be included in an Event Schema. 
patternProperties: - ^__: + (?!__): type: - string - object @@ -14,4 +14,4 @@ patternProperties: - integer - boolean - array -minProperties: 1 +additionalProperties: false diff --git a/jupyter_events/validators.py b/jupyter_events/validators.py index 92a86e2..14f7abe 100644 --- a/jupyter_events/validators.py +++ b/jupyter_events/validators.py @@ -13,14 +13,13 @@ "reserved-property-metaschema.yml" ) RESERVED_PROPERTY_METASCHEMA = yaml.load(RESERVED_PROPERTY_METASCHEMA_FILEPATH) +SCHEMA_STORE = { + PROPERTY_METASCHEMA["$id"]: PROPERTY_METASCHEMA, + RESERVED_PROPERTY_METASCHEMA["$id"]: RESERVED_PROPERTY_METASCHEMA, +} METASCHEMA_RESOLVER = RefResolver( - base_uri=EVENT_METASCHEMA["$id"], - referrer=EVENT_METASCHEMA, - store={ - PROPERTY_METASCHEMA["$id"]: PROPERTY_METASCHEMA, - RESERVED_PROPERTY_METASCHEMA["$id"]: RESERVED_PROPERTY_METASCHEMA, - }, + base_uri=EVENT_METASCHEMA["$id"], referrer=EVENT_METASCHEMA, store=SCHEMA_STORE ) -JUPYTER_EVENTS_VALIDATOR = validators.Draft7Validator( - EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER +JUPYTER_EVENTS_VALIDATOR = validators.Draft202012Validator( + schema=EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER ) diff --git a/tests/schemas/good/basic.yaml b/tests/schemas/good/basic.yaml index 6511819..b33b45e 100644 --- a/tests/schemas/good/basic.yaml +++ b/tests/schemas/good/basic.yaml @@ -9,8 +9,7 @@ redactionPolicies: properties: prop: title: Test Property - description: | - Test property. + description: Test property. 
redactionPolicies: - unrestricted type: string diff --git a/tests/test_schema.py b/tests/test_schema.py index 16ad714..57ec212 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -17,6 +17,7 @@ ], ["missing-policy-array.yaml", MISSING_REDACTION_POLICY], ["missing-policy-nested-array.yaml", MISSING_REDACTION_POLICY], + # ["reserved-property.yaml", "Something"] ] From b972ed3a99d14a5c62f1f57ec030fa41bfca4d81 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 12:12:24 -0700 Subject: [PATCH 08/14] protect reserved property names (starting with __) --- jupyter_events/schema.py | 4 ++-- jupyter_events/schemas/event-metaschema.yml | 2 ++ .../schemas/property-metaschema.yml | 10 ++++++++ .../schemas/reserved-property-metaschema.yml | 17 ------------- jupyter_events/validators.py | 24 ++++++++++++++----- tests/schemas/bad/reserved-property.yaml | 15 ++++++++++++ tests/test_schema.py | 8 +++---- 7 files changed, 51 insertions(+), 29 deletions(-) delete mode 100644 jupyter_events/schemas/reserved-property-metaschema.yml create mode 100644 tests/schemas/bad/reserved-property.yaml diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 67ea69d..42b3209 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -3,7 +3,7 @@ from jsonschema import validators from . import yaml -from .validators import JUPYTER_EVENTS_VALIDATOR +from .validators import validate_schema def _pop_nested_redacted_fields( @@ -104,7 +104,7 @@ def __init__( redacted_policies: Union[str, list, None] = None, ): # Validate the schema against Jupyter Events metaschema. - JUPYTER_EVENTS_VALIDATOR.validate(schema) + validate_schema(schema) # Build a mapping of all property redaction policies. 
self._redaction_policies_locations = _find_redaction_policies(schema) self._redacted_policies = self._validate_redacted_policies(redacted_policies) diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml index 14aa90e..9bbd7e5 100644 --- a/jupyter_events/schemas/event-metaschema.yml +++ b/jupyter_events/schemas/event-metaschema.yml @@ -21,6 +21,8 @@ properties: type: object additionalProperties: $ref: http://event.jupyter.org/property-metaschema + propertyNames: + pattern: ^(?!__.*) required: - $id diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index 6ff27cd..f2f58e3 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -5,6 +5,7 @@ title: Property Metaschema description: | A metaschema for validating properties within an event schema + properties: title: type: string @@ -18,9 +19,18 @@ properties: type: object additionalProperties: $ref: http://event.jupyter.org/property-metaschema + propertyNames: + pattern: ^(?!__.*) + items: $ref: http://event.jupyter.org/property-metaschema required: - title - redactionPolicies + +additionalProperties: + $ref: http://event.jupyter.org/property-metaschema + +propertyNames: + pattern: ^(?!__.*) diff --git a/jupyter_events/schemas/reserved-property-metaschema.yml b/jupyter_events/schemas/reserved-property-metaschema.yml deleted file mode 100644 index 204ed2f..0000000 --- a/jupyter_events/schemas/reserved-property-metaschema.yml +++ /dev/null @@ -1,17 +0,0 @@ -$schema: http://json-schema.org/draft-07/schema -$id: http://event.jupyter.org/reserved-property-metaschema -version: 1 -title: Reserved Property Metaschema -description: | - Property names that are reserved for Jupyter Events and should not - be included in an Event Schema. 
-patternProperties: - (?!__): - type: - - string - - object - - number - - integer - - boolean - - array -additionalProperties: false diff --git a/jupyter_events/validators.py b/jupyter_events/validators.py index 14f7abe..6f55ccd 100644 --- a/jupyter_events/validators.py +++ b/jupyter_events/validators.py @@ -1,6 +1,7 @@ import pathlib +from xml.etree.ElementPath import prepare_self -from jsonschema import RefResolver, validators +from jsonschema import RefResolver, ValidationError, validators from . import yaml @@ -9,13 +10,8 @@ EVENT_METASCHEMA = yaml.load(EVENT_METASCHEMA_FILEPATH) PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath("property-metaschema.yml") PROPERTY_METASCHEMA = yaml.load(PROPERTY_METASCHEMA_FILEPATH) -RESERVED_PROPERTY_METASCHEMA_FILEPATH = METASCHEMA_PATH.joinpath( - "reserved-property-metaschema.yml" -) -RESERVED_PROPERTY_METASCHEMA = yaml.load(RESERVED_PROPERTY_METASCHEMA_FILEPATH) SCHEMA_STORE = { PROPERTY_METASCHEMA["$id"]: PROPERTY_METASCHEMA, - RESERVED_PROPERTY_METASCHEMA["$id"]: RESERVED_PROPERTY_METASCHEMA, } METASCHEMA_RESOLVER = RefResolver( base_uri=EVENT_METASCHEMA["$id"], referrer=EVENT_METASCHEMA, store=SCHEMA_STORE @@ -23,3 +19,19 @@ JUPYTER_EVENTS_VALIDATOR = validators.Draft202012Validator( schema=EVENT_METASCHEMA, resolver=METASCHEMA_RESOLVER ) + + +def validate_schema(schema: dict): + try: + # Validate the schema against Jupyter Events metaschema. + JUPYTER_EVENTS_VALIDATOR.validate(schema) + except ValidationError as err: + reserved_property_msg = " does not match '^(?!__.*)'" + if reserved_property_msg in err.message: + bad_property = err.message[: -(len(reserved_property_msg))] + raise ValidationError( + f"{bad_property} is an invalid property name because it " + "starts with `__`. Properties starting with 'dunder' " + "are reserved for Jupyter Events." 
+ ) + raise err diff --git a/tests/schemas/bad/reserved-property.yaml b/tests/schemas/bad/reserved-property.yaml new file mode 100644 index 0000000..d543823 --- /dev/null +++ b/tests/schemas/bad/reserved-property.yaml @@ -0,0 +1,15 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Simple Test Schema +description: | + A simple schema for testing +type: object +redactionPolicies: + - unrestricted +properties: + __badName: + title: Test Property + description: Test property. + redactionPolicies: + - unrestricted + type: string diff --git a/tests/test_schema.py b/tests/test_schema.py index 57ec212..830c57d 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -2,7 +2,7 @@ from jsonschema.exceptions import ValidationError from jupyter_events import yaml -from jupyter_events.validators import JUPYTER_EVENTS_VALIDATOR +from jupyter_events.validators import validate_schema from .utils import SCHEMA_PATH @@ -17,7 +17,7 @@ ], ["missing-policy-array.yaml", MISSING_REDACTION_POLICY], ["missing-policy-nested-array.yaml", MISSING_REDACTION_POLICY], - # ["reserved-property.yaml", "Something"] + ["reserved-property.yaml", "Properties starting with 'dunder'"], ] @@ -32,7 +32,7 @@ def test_bad_validations(schema_file, validation_error_msg): schema = yaml.loads(f) # Assert that the schema files for a known reason. with pytest.raises(ValidationError) as err: - JUPYTER_EVENTS_VALIDATOR.validate(schema) + validate_schema(schema) assert validation_error_msg in err.value.message @@ -49,4 +49,4 @@ def test_good_validations(schema_file): with open(SCHEMA_PATH / "good" / schema_file) as f: schema = yaml.loads(f) # Assert that the schema files for a known reason. 
- JUPYTER_EVENTS_VALIDATOR.validate(schema) + validate_schema(schema) From be49d43d439059858782db349413e4c2a6ac1e82 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 12:12:50 -0700 Subject: [PATCH 09/14] protect reserved property names (starting with __) --- tests/test_logger.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index ad8e79b..a05fc09 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -92,31 +92,6 @@ def test_missing_required_properties(): ) -# def test_reserved_properties(): -# """ -# User schemas can't have properties starting with __ - -# These are reserved -# """ -# el = EventLogger() -# # with pytest.raises(ValidationError): -# el.register_schema( -# { -# "$id": "test/test", -# "title": "Test", -# "version": 1, -# "redactionPolicies": ["unrestricted"], -# "properties": { -# "__fail__": { -# "type": "string", -# "title": "test", -# "redactionPolicies": ["unrestricted"], -# }, -# }, -# } -# ) - - def test_timestamp_override(): """ Simple test for overriding timestamp From a59619e94c9d828a27d60ab46da44442bd2f7c27 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 13:45:17 -0700 Subject: [PATCH 10/14] more typing --- jupyter_events/logger.py | 43 ++++++++++++++++++------------- jupyter_events/schema.py | 18 +++++++------ jupyter_events/schema_registry.py | 8 +++--- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 60ef881..861c330 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -16,7 +16,12 @@ class EventLogger(Configurable): """ - Send structured events to a logging sink + An Event logger for emitting structured events. + + Event schemas must be registered with the + EventLogger using the `register_schema` or + `register_schema_file` methods. Every schema + will be validated against Jupyter Event's metaschema. 
""" handlers = Handlers( @@ -24,25 +29,27 @@ class EventLogger(Configurable): allow_none=True, help="""A list of logging.Handler instances to send events to. - When set to None (the default), events are discarded. + When set to None (the default), all events are discarded. """, ).tag(config=True) redacted_policies = List( default_value=None, allow_none=True, - help=( - """ - A list of the redaction policies that will not be redacted - from incoming, recorded events. - """ - ), - ) + help="""A list of the redactionPolicies that will be redacted + from emitted events. + """, + ).tag(config=True) - schemas = Instance(SchemaRegistry) + schemas = Instance( + SchemaRegistry, + help="""The SchemaRegistry for caching validated schemas + and their jsonschema validators. + """, + ) @default("schemas") - def _default_schemas(self): + def _default_schemas(self) -> SchemaRegistry: return SchemaRegistry(redacted_policies=self.redacted_policies) def __init__(self, *args, **kwargs): @@ -79,14 +86,14 @@ def get_handlers(): eventlogger_cfg = Config({"EventLogger": my_cfg}) super()._load_config(eventlogger_cfg, section_names=None, traits=None) - def register_schema(self, schema): + def register_schema(self, schema: dict): """Register this schema with the schema registry. Get this registered schema using the EventLogger.schema.get() method. """ self.schemas.register(schema) - def register_schema_file(self, schema_file): + def register_schema_file(self, schema_file: str): """Register this schema with the schema registry. Get this registered schema using the EventLogger.schema.get() method. 
@@ -114,13 +121,13 @@ def _skip_message(record, **kwargs): if handler not in self.handlers: self.handlers.append(handler) - def remove_handler(self, handler): + def remove_handler(self, handler: logging.Handler): """Remove the logging handler from the logger and list of handlers.""" self.log.removeHandler(handler) if handler in self.handlers: self.handlers.remove(handler) - def emit(self, schema_name, version, event, timestamp_override=None): + def emit(self, id: str, version: int, event: dict, timestamp_override=None): """ Record given event with schema has occurred. @@ -140,7 +147,7 @@ def emit(self, schema_name, version, event, timestamp_override=None): dict The recorded event data """ - if not self.handlers or (schema_name, version) not in self.schemas: + if not self.handlers or (id, version) not in self.schemas: # if handler isn't set up or schema is not explicitly whitelisted, # don't do anything return @@ -152,12 +159,12 @@ def emit(self, schema_name, version, event, timestamp_override=None): timestamp = timestamp_override capsule = { "__timestamp__": timestamp.isoformat() + "Z", - "__schema__": schema_name, + "__schema__": id, "__schema_version__": version, "__metadata_version__": EVENTS_METADATA_VERSION, } # Process this event, i.e. 
validate and redact (in place) - self.schemas.process_event(schema_name, version, event) + self.schemas.process_event(id, version, event) capsule.update(event) self.log.info(capsule) return capsule diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 42b3209..8d4c70b 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Hashable, List, Sequence, Union +from typing import Any, Dict, Hashable, List, Sequence, Tuple, Union from jsonschema import validators @@ -31,7 +31,7 @@ def _pop_nested_redacted_fields( return nested_data.pop(policy_location[-1]) -def _find_redaction_policies(schema: dict): +def _find_redaction_policies(schema: dict) -> Dict[str, list]: """A recursive function that iterates an event schema and returns a mapping of redaction policies to (nested) properties (identified by a sequence of keys). @@ -112,13 +112,15 @@ def __init__( self._validator = validator_class(schema, resolver=resolver) self._schema = schema - def _validate_redacted_policies(self, redacted_policies): + def _validate_redacted_policies( + self, redacted_policies: Union[None, List, str, set] + ) -> set: if redacted_policies is None: return set() value_type = type(redacted_policies) if value_type == str and redacted_policies == "all": return set(self.redaction_policies_locations.keys()) - if value_type == list: + if value_type == list or value_type == set: return set(redacted_policies) raise TypeError( "redacted_policies must be the literal string, 'all', or a list of " @@ -126,21 +128,21 @@ def _validate_redacted_policies(self, redacted_policies): ) @property - def id(self): + def id(self) -> str: """Schema $id field.""" return self._schema["$id"] @property - def version(self): + def version(self) -> int: """Schema's version.""" return self._schema["version"] @property - def registry_key(self): + def registry_key(self) -> Tuple[str, int]: return (self.id, self.version) @property - def 
redacted_policies(self): + def redacted_policies(self) -> set: """The redaction policies that will not be redacted when an incoming event is processed. """ diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index 95fca6b..1dd57ca 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -21,7 +21,7 @@ def redacted_policies(self) -> Optional[List[Any]]: """ return self._redacted_policies - def __contains__(self, registry_key: Tuple): + def __contains__(self, registry_key: Tuple[str, int]): """Syntax sugar to check if a schema is found in the registry""" return registry_key in self._schemas @@ -34,16 +34,16 @@ def _add(self, schema_obj: EventSchema): ) self._schemas[schema_obj.registry_key] = schema_obj - def register(self, schema_data): + def register(self, data: dict): """Add a valid schema to the registry. All schemas are validated against the Jupyter Events meta-schema found here: """ - schema = EventSchema(schema_data, redacted_policies=self.redacted_policies) + schema = EventSchema(data, redacted_policies=self.redacted_policies) self._add(schema) - def register_from_file(self, schema_filepath): + def register_from_file(self, schema_filepath: str): """Register a schema from a file.""" schema = EventSchema.from_file( schema_filepath, redacted_policies=self.redacted_policies From ea70c1d9ebff5a014ae7f9c56247320d727c9c71 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 13:47:43 -0700 Subject: [PATCH 11/14] update readme --- README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index bd1cd72..7b8909f 100644 --- a/README.md +++ b/README.md @@ -29,10 +29,6 @@ logger = EventLogger( handlers=[ logging.FileHandler('events.log') ], - # List schemas of events that should be recorded. 
- allowed_schemas=[ - 'uri.to.event.schema' - ] ) ``` @@ -49,14 +45,16 @@ Event schemas must be registered with the `EventLogger` for events to be recorde "title": "My Event", "description": "All events must have a name property.", "type": "object", + "redactionPolicies": ["unrestricted"], "properties": { - "name": { - "title": "Name", + "event_name": { + "title": "Event Name", "description": "Name of event", - "type": "string" + "type": "string", + "redactionPolicies": ["unrestricted"] } }, - "required": ["name"], + "required": ["event_name"], "version": 1 } ``` @@ -79,7 +77,7 @@ Events are recorded using the `record_event` method. This method validates the e ```python # Record an example event. -event = {'name': 'example event'} +event = {'event_name': 'example event'} logger.record_event( schema_id='url.to.event.schema', version=1, From 53398002820dbd741001a975feb09db9133ca99f Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 21 Jul 2022 13:51:13 -0700 Subject: [PATCH 12/14] precommit --- jupyter_events/validators.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/jupyter_events/validators.py b/jupyter_events/validators.py index 6f55ccd..e64d671 100644 --- a/jupyter_events/validators.py +++ b/jupyter_events/validators.py @@ -1,5 +1,4 @@ import pathlib -from xml.etree.ElementPath import prepare_self from jsonschema import RefResolver, ValidationError, validators @@ -27,8 +26,8 @@ def validate_schema(schema: dict): JUPYTER_EVENTS_VALIDATOR.validate(schema) except ValidationError as err: reserved_property_msg = " does not match '^(?!__.*)'" - if reserved_property_msg in err.message: - bad_property = err.message[: -(len(reserved_property_msg))] + if reserved_property_msg in str(err): + bad_property = str(err)[: -(len(reserved_property_msg))] raise ValidationError( f"{bad_property} is an invalid property name because it " "starts with `__`. 
Properties starting with 'dunder' " From b90ce6e8fe2aacab93c9e9960dc3a3739bb485f0 Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 28 Jul 2022 07:54:16 -0700 Subject: [PATCH 13/14] remove redacted policies --- jupyter_events/logger.py | 14 +- jupyter_events/schema.py | 120 ------------------ jupyter_events/schema_registry.py | 23 +--- jupyter_events/schemas/event-metaschema.yml | 5 - .../schemas/property-metaschema.yml | 5 - jupyter_events/traits.py | 34 ----- .../schemas/bad/missing-parent-policies.yaml | 17 --- tests/schemas/bad/missing-policy-array.yaml | 30 ----- ...ray.yaml => nested-reserved-property.yaml} | 19 +-- tests/schemas/bad/reserved-property.yaml | 4 - tests/schemas/good/array.yaml | 10 -- tests/schemas/good/basic.yaml | 4 - tests/schemas/good/nested-array.yaml | 16 --- tests/test_logger.py | 19 --- tests/test_redaction.py | 73 ----------- tests/test_schema.py | 11 +- tests/test_schema_registry.py | 0 tests/test_traits.py | 45 +------ 18 files changed, 10 insertions(+), 439 deletions(-) delete mode 100644 tests/schemas/bad/missing-parent-policies.yaml delete mode 100644 tests/schemas/bad/missing-policy-array.yaml rename tests/schemas/bad/{missing-policy-nested-array.yaml => nested-reserved-property.yaml} (55%) delete mode 100644 tests/test_redaction.py delete mode 100644 tests/test_schema_registry.py diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 861c330..5392adc 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -6,7 +6,7 @@ from datetime import datetime from pythonjsonlogger import jsonlogger -from traitlets import Instance, List, default +from traitlets import Instance, default from traitlets.config import Config, Configurable from . import EVENTS_METADATA_VERSION @@ -33,14 +33,6 @@ class EventLogger(Configurable): """, ).tag(config=True) - redacted_policies = List( - default_value=None, - allow_none=True, - help="""A list of the redactionPolicies that will be redacted - from emitted events. 
- """, - ).tag(config=True) - schemas = Instance( SchemaRegistry, help="""The SchemaRegistry for caching validated schemas @@ -50,7 +42,7 @@ class EventLogger(Configurable): @default("schemas") def _default_schemas(self) -> SchemaRegistry: - return SchemaRegistry(redacted_policies=self.redacted_policies) + return SchemaRegistry() def __init__(self, *args, **kwargs): # We need to initialize the configurable before @@ -164,7 +156,7 @@ def emit(self, id: str, version: int, event: dict, timestamp_override=None): "__metadata_version__": EVENTS_METADATA_VERSION, } # Process this event, i.e. validate and redact (in place) - self.schemas.process_event(id, version, event) + self.schemas.validate_event(id, version, event) capsule.update(event) self.log.info(capsule) return capsule diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 8d4c70b..4be7c42 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -6,65 +6,6 @@ from .validators import validate_schema -def _pop_nested_redacted_fields( - schema_data: dict, policy_location: Sequence[Hashable] -) -> Any: - """Pop a item nested anywhere in a dwictionary using the - list of (hashable) keys to locate the item. - """ - # Begin walking the sequence of keys to the policy - # location given. - nested_data = schema_data - for i, el in enumerate(policy_location[:-1]): - # Handle arrays of objects. - if el == "__array__": - for j, _ in enumerate(nested_data): - branch = policy_location[i + 1 :] - _pop_nested_redacted_fields(nested_data[j], branch) - return - # Try moving into nested child schema. - try: - nested_data = nested_data[el] - except KeyError: - return - # If we made it this far, we ended on a policy that needs to be popped. 
- return nested_data.pop(policy_location[-1]) - - -def _find_redaction_policies(schema: dict) -> Dict[str, list]: - """A recursive function that iterates an event schema - and returns a mapping of redaction policies to - (nested) properties (identified by a sequence of keys). - """ - redaction_policies: Dict[str, List[str]] = {} - - def _extract_policies(subschema, key_sequence=()): - props = subschema["properties"] - for key, obj in props.items(): - updated_key_sequence = key_sequence + (key,) - - def _nested_extract_policies(obj, updated_key_sequence): - if isinstance(obj, dict): - if "properties" in obj: - _extract_policies(obj, updated_key_sequence) - if "items" in obj and "properties" in obj["items"]: - _nested_extract_policies( - obj["items"], updated_key_sequence + ("__array__",) - ) - - _nested_extract_policies(obj, updated_key_sequence) - - # Update the list in place. - for policy in obj["redactionPolicies"]: - policies_list = redaction_policies.get(policy, []) - policies_list.append(updated_key_sequence) - redaction_policies[policy] = policies_list - - # Start the recursion - _extract_policies(schema) - return redaction_policies - - class EventSchema: """A validated schema that can be used. @@ -85,15 +26,6 @@ class EventSchema: resolver: RefResolver for nested JSON schema references. - - allowed_policies: set - A set of redaction policied allowed by this event schema. - Each property in the schema must have a `redactionPolicy` - annotation representing the level of sensitivity of the - data collected by this event. In order for that data - to be emitted of Jupyter Events, the matching redaction - policy must be listed here. - """ def __init__( @@ -101,32 +33,13 @@ def __init__( schema, validator_class=validators.Draft7Validator, resolver=None, - redacted_policies: Union[str, list, None] = None, ): # Validate the schema against Jupyter Events metaschema. validate_schema(schema) - # Build a mapping of all property redaction policies. 
- self._redaction_policies_locations = _find_redaction_policies(schema) - self._redacted_policies = self._validate_redacted_policies(redacted_policies) # Create a validator for this schema self._validator = validator_class(schema, resolver=resolver) self._schema = schema - def _validate_redacted_policies( - self, redacted_policies: Union[None, List, str, set] - ) -> set: - if redacted_policies is None: - return set() - value_type = type(redacted_policies) - if value_type == str and redacted_policies == "all": - return set(self.redaction_policies_locations.keys()) - if value_type == list or value_type == set: - return set(redacted_policies) - raise TypeError( - "redacted_policies must be the literal string, 'all', or a list of " - "redaction polices" - ) - @property def id(self) -> str: """Schema $id field.""" @@ -141,53 +54,20 @@ def version(self) -> int: def registry_key(self) -> Tuple[str, int]: return (self.id, self.version) - @property - def redacted_policies(self) -> set: - """The redaction policies that will not be redacted when an - incoming event is processed. - """ - return self._redacted_policies - @classmethod def from_file( cls, filepath, validator_class=validators.Draft7Validator, resolver=None, - redacted_policies=None, ): schema = yaml.load(filepath) return cls( schema=schema, validator_class=validator_class, resolver=resolver, - redacted_policies=redacted_policies, ) - @property - def redaction_policies_locations(self) -> Dict[str, List[str]]: - """Mapping of the redaction policies in this schema to - the (nested) properties where they are defined. - """ - return self._redaction_policies_locations - def validate(self, data: dict) -> None: """Validate an incoming instance of this event schema.""" self._validator.validate(data) - - def enforce_redaction_policies(self, data: dict) -> None: - """Redact fields from""" - # # Find all policies not explicitly allowed. 
- # named_policies = set(self.redaction_policies_locations.keys()) - # redacted_policies = named_policies - self.unredacted_policies - for policy_type in self.redacted_policies: - policy_locations = self._redaction_policies_locations[policy_type] - for item in policy_locations: - _pop_nested_redacted_fields(data, item) - - def process(self, data: dict) -> None: - """Validate event data and enforce an redaction policies (in place). - Nothing is returned by this method, because the data is redacted in place. - """ - self.validate(data) - self.enforce_redaction_policies(data) diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index 1dd57ca..471aa83 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -10,16 +10,8 @@ class SchemaRegistryException(Exception): class SchemaRegistry: """A convenient API for storing and searching a group of schemas.""" - def __init__(self, schemas: dict = None, redacted_policies: list = None): + def __init__(self, schemas: dict = None): self._schemas = schemas or {} - self._redacted_policies = redacted_policies - - @property - def redacted_policies(self) -> Optional[List[Any]]: - """A list of policies that will be redacted from - all events validated against this registry. 
- """ - return self._redacted_policies def __contains__(self, registry_key: Tuple[str, int]): """Syntax sugar to check if a schema is found in the registry""" @@ -40,14 +32,12 @@ def register(self, data: dict): All schemas are validated against the Jupyter Events meta-schema found here: """ - schema = EventSchema(data, redacted_policies=self.redacted_policies) + schema = EventSchema(data) self._add(schema) def register_from_file(self, schema_filepath: str): """Register a schema from a file.""" - schema = EventSchema.from_file( - schema_filepath, redacted_policies=self.redacted_policies - ) + schema = EventSchema.from_file(schema_filepath) self._add(schema) def get(self, id: str, version: int) -> EventSchema: @@ -82,10 +72,3 @@ def validate_event(self, id: str, version: int, data: dict) -> None: """ schema = self.get(id, version) schema.validate(data) - - def process_event(self, id: str, version: int, data: dict) -> None: - """Validate and event and enforce an redaction policies (in place). - Nothing is returned by this method, because the data is redacted in place. 
- """ - schema = self.get(id, version) - schema.process(data) diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml index 9bbd7e5..913e572 100644 --- a/jupyter_events/schemas/event-metaschema.yml +++ b/jupyter_events/schemas/event-metaschema.yml @@ -13,10 +13,6 @@ properties: type: string description: type: string - redactionPolicies: - type: array - items: - type: string properties: type: object additionalProperties: @@ -27,5 +23,4 @@ properties: required: - $id - version - - redactionPolicies - properties diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index f2f58e3..176a711 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -11,10 +11,6 @@ properties: type: string description: type: string - redactionPolicies: - type: array - items: - type: string properties: type: object additionalProperties: @@ -27,7 +23,6 @@ properties: required: - title - - redactionPolicies additionalProperties: $ref: http://event.jupyter.org/property-metaschema diff --git a/jupyter_events/traits.py b/jupyter_events/traits.py index 848bd4b..020320c 100644 --- a/jupyter_events/traits.py +++ b/jupyter_events/traits.py @@ -42,37 +42,3 @@ def validate(self, obj, value): return value else: self.error(obj, value) - - -class SchemaOptions(TraitType): - """A trait for handling options for recording schemas.""" - - info_text = "either a dictionary with schema options or a list with schema names." - - def validate(self, obj, val): - # If the type is a dictionary. - if type(val) is dict: - for schema_name, data in val.items(): - given_keys = set(data.keys()) - # Compare against keys expected. - allowed_keys = {"allowed_categories", "allowed_properties"} - # There should be no extra keys (anything other than - # allowed_keys) in the schema options. 
- unknown_keys = given_keys.difference(allowed_keys) - if unknown_keys: - # Throw an error if there are unknown keys. - raise TraitError( - "The schema option, {schema_name}, includes " - "unknown key(s): {unknown_keys}".format( - schema_name=schema_name, unknown_keys=",".join(unknown_keys) - ) - ) - validated_val = val - # If the type is a list (for backwards compatibility). - elif type(val) is list: - validated_val = {} - for schema_name in val: - validated_val[schema_name] = {} - else: - raise TraitError("SchemaOptions must be of type dict or list.") - return validated_val diff --git a/tests/schemas/bad/missing-parent-policies.yaml b/tests/schemas/bad/missing-parent-policies.yaml deleted file mode 100644 index bbf0c8b..0000000 --- a/tests/schemas/bad/missing-parent-policies.yaml +++ /dev/null @@ -1,17 +0,0 @@ -$id: http://event.jupyter.org/test-simple -version: 1 -title: Simple Test Schema -description: | - Fails validation because the root level of this schema - is missing redactionPolicies. -type: object -properties: - prop1: - title: Test Property 1 - description: | - Test property 1. - redactionPolicies: - - unrestricted - type: string -required: - - prop1 diff --git a/tests/schemas/bad/missing-policy-array.yaml b/tests/schemas/bad/missing-policy-array.yaml deleted file mode 100644 index a6897f9..0000000 --- a/tests/schemas/bad/missing-policy-array.yaml +++ /dev/null @@ -1,30 +0,0 @@ -$id: http://event.jupyter.org/test -version: 1 -title: Schema with Array -description: | - A schema for an array of objects. -type: object -redactionPolicies: - - unrestricted -properties: - users: - title: Test User Array - description: | - Test User array. 
- redactionPolicies: - - unrestricted - type: array - items: - type: object - title: User - redactionPolicies: - - unrestricted - properties: - email: - type: string - title: Email - id: - type: string - title: Name - redactionPolicies: - - user-identifier diff --git a/tests/schemas/bad/missing-policy-nested-array.yaml b/tests/schemas/bad/nested-reserved-property.yaml similarity index 55% rename from tests/schemas/bad/missing-policy-nested-array.yaml rename to tests/schemas/bad/nested-reserved-property.yaml index e76b4ad..4ea004d 100644 --- a/tests/schemas/bad/missing-policy-nested-array.yaml +++ b/tests/schemas/bad/nested-reserved-property.yaml @@ -2,46 +2,31 @@ $id: http://event.jupyter.org/test version: 1 title: Schema with Array description: | - Fails validation because an element in the nested array - doesn't have a redactionPolicies field (see "position"). + A schema for an array of objects. type: object -redactionPolicies: - - unrestricted properties: users: title: Test User Array description: | Test User array. - redactionPolicies: - - unrestricted type: array items: type: object title: User - redactionPolicies: - - unrestricted properties: name: type: string title: Name - redactionPolicies: - - user-identifier hobbies: type: array title: Hobbies - redactionPolicies: - - unrestricted items: type: object title: Hobby - redactionPolicies: - - unrestricted properties: - sport: + __badName: title: Sport Name type: string - redactionPolicies: - - unrestricted position: title: Position type: string diff --git a/tests/schemas/bad/reserved-property.yaml b/tests/schemas/bad/reserved-property.yaml index d543823..919a937 100644 --- a/tests/schemas/bad/reserved-property.yaml +++ b/tests/schemas/bad/reserved-property.yaml @@ -4,12 +4,8 @@ title: Simple Test Schema description: | A simple schema for testing type: object -redactionPolicies: - - unrestricted properties: __badName: title: Test Property description: Test property. 
- redactionPolicies: - - unrestricted type: string diff --git a/tests/schemas/good/array.yaml b/tests/schemas/good/array.yaml index 336af90..a917374 100644 --- a/tests/schemas/good/array.yaml +++ b/tests/schemas/good/array.yaml @@ -4,29 +4,19 @@ title: Schema with Array description: | A schema for an array of objects. type: object -redactionPolicies: - - unrestricted properties: users: title: Test User Array description: | Test User array. - redactionPolicies: - - unrestricted type: array items: type: object title: User - redactionPolicies: - - unrestricted properties: email: type: string title: Email - redactionPolicies: - - user-identifiable-information id: type: string title: Name - redactionPolicies: - - user-identifier diff --git a/tests/schemas/good/basic.yaml b/tests/schemas/good/basic.yaml index b33b45e..33a73e4 100644 --- a/tests/schemas/good/basic.yaml +++ b/tests/schemas/good/basic.yaml @@ -4,12 +4,8 @@ title: Simple Test Schema description: | A simple schema for testing type: object -redactionPolicies: - - unrestricted properties: prop: title: Test Property description: Test property. - redactionPolicies: - - unrestricted type: string diff --git a/tests/schemas/good/nested-array.yaml b/tests/schemas/good/nested-array.yaml index 61bc86d..f54c3cf 100644 --- a/tests/schemas/good/nested-array.yaml +++ b/tests/schemas/good/nested-array.yaml @@ -4,45 +4,29 @@ title: Schema with Array description: | A schema for an array of objects. type: object -redactionPolicies: - - unrestricted properties: users: title: Test User Array description: | Test User array. 
- redactionPolicies: - - unrestricted type: array items: type: object title: User - redactionPolicies: - - unrestricted properties: name: type: string title: Name - redactionPolicies: - - user-identifier hobbies: type: array title: Hobbies - redactionPolicies: - - unrestricted items: type: object title: Hobby - redactionPolicies: - - unrestricted properties: sport: title: Sport Name type: string - redactionPolicies: - - unrestricted position: title: Position type: string - redactionPolicies: - - user-identifiable-information diff --git a/tests/test_logger.py b/tests/test_logger.py index a05fc09..603ea37 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -99,12 +99,10 @@ def test_timestamp_override(): schema = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -130,12 +128,10 @@ def test_emit(): schema = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -174,13 +170,11 @@ def test_register_schema_file(tmp_path): schema = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -199,13 +193,11 @@ def test_register_schema_file_object(tmp_path): schema = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -225,18 +217,15 @@ def test_emit_badschema(): schema = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, 
"status": { "enum": ["success", "failure"], "title": "test 2", - "redactionPolicies": ["unrestricted"], }, }, } @@ -253,13 +242,11 @@ def test_unique_logger_instances(): schema0 = { "$id": "test/test0", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -267,13 +254,11 @@ def test_unique_logger_instances(): schema1 = { "$id": "test/test1", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -337,13 +322,11 @@ def test_register_duplicate_schemas(): schema0 = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } @@ -351,13 +334,11 @@ def test_register_duplicate_schemas(): schema1 = { "$id": "test/test", "version": 1, - "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", - "redactionPolicies": ["unrestricted"], }, }, } diff --git a/tests/test_redaction.py b/tests/test_redaction.py deleted file mode 100644 index 61ea086..0000000 --- a/tests/test_redaction.py +++ /dev/null @@ -1,73 +0,0 @@ -import pathlib - -import pytest - -from jupyter_events.schema import EventSchema - -SCHEMA_PATH = pathlib.Path(__file__).parent / "schemas" - - -@pytest.mark.parametrize( - "schema_file,redacted_policies,data,data_out", - [ - [ - # Schema name - "array.yaml", - # Redacted policies - ["user-identifier", "user-identifiable-information"], - # Unredacted data - { - "nothing-exciting": "hello, world", - "users": [ - {"id": "test id 0", "email": "test0@testemail.com"}, - {"id": "test id 1", "email": "test1@testemail.com"}, - ], - }, - # Redacted data - { - "nothing-exciting": "hello, world", - 
"users": [{}, {}], - }, - ], - [ - # Schema name - "nested-array.yaml", - # Redacted policies - ["user-identifier", "user-identifiable-information"], - # Unredacted data - { - "nothing-exciting": "hello, world", - "users": [ - { - "name": "Alice", - "hobbies": [ - {"sport": "basketball", "position": "guard"}, - {"sport": "soccer", "position": "striker"}, - ], - }, - { - "name": "Bob", - "hobbies": [ - {"sport": "basketball", "position": "center"}, - {"sport": "soccer", "position": "goalie"}, - ], - }, - ], - }, - # Redacted data - { - "nothing-exciting": "hello, world", - "users": [ - {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, - {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, - ], - }, - ], - ], -) -def test_redaction_in_arrays(schema_file, redacted_policies, data, data_out): - schema = EventSchema.from_file( - SCHEMA_PATH / "good" / schema_file, redacted_policies=redacted_policies - ) - schema.enforce_redaction_policies(data) - assert data == data_out diff --git a/tests/test_schema.py b/tests/test_schema.py index 830c57d..098a890 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -6,18 +6,9 @@ from .utils import SCHEMA_PATH -MISSING_REDACTION_POLICY = "'redactionPolicies' is a required property" - BAD_SCHEMAS = [ - [ - # Bad schema file. - "missing-parent-policies.yaml", - # The expected valdation error message. 
- MISSING_REDACTION_POLICY, - ], - ["missing-policy-array.yaml", MISSING_REDACTION_POLICY], - ["missing-policy-nested-array.yaml", MISSING_REDACTION_POLICY], ["reserved-property.yaml", "Properties starting with 'dunder'"], + ["nested-reserved-property.yaml", "Properties starting with 'dunder'"], ] diff --git a/tests/test_schema_registry.py b/tests/test_schema_registry.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_traits.py b/tests/test_traits.py index 1740f18..a6878f2 100644 --- a/tests/test_traits.py +++ b/tests/test_traits.py @@ -3,7 +3,7 @@ import pytest from traitlets import HasTraits, TraitError -from jupyter_events.traits import Handlers, SchemaOptions +from jupyter_events.traits import Handlers class HasHandlers(HasTraits): @@ -27,46 +27,3 @@ def test_mixed_handlers_values(): handlers = [logging.NullHandler(), 1] with pytest.raises(TraitError): HasHandlers(handlers=handlers) - - -class HasSchemaOptions(HasTraits): - schema_options = SchemaOptions({}, allow_none=True) - - -@pytest.mark.parametrize( - "schema_options", - [ - # schema_options can be a list of schema_names. In this case, - # the SchemaOptions trait will turn this list into a dictionary - # with the list items as keys the values as empty dictionaries. - ["schema_name_1", "schema_name_2"], - # Empty nested config are okay. - {"schema_name_1": {}}, - # Nested config with empty values is okay too. - {"schema_name_1": {"allowed_categories": []}}, - # Test complete config for good measure. - {"schema_name_1": {"allowed_categories": ["value"]}}, - # Test multiple values. - {"schema_name_1": {"allowed_categories": ["value"]}, "schema_name_2": {}}, - ], -) -def test_good_schema_options(schema_options): - obj = HasSchemaOptions(schema_options=schema_options) - assert type(obj.schema_options) == dict - - -@pytest.mark.parametrize( - "schema_options", - [ - # Raise an error if Schema Options has unknown attribute. 
- {"schema_name_1": {"unknown_attribute": []}}, - # Test multiple values. - { - "schema_name_1": {"allowed_categories": ["value"]}, - "schema_name_2": {"unknown_attribute": []}, - }, - ], -) -def test_bad_schema_options(schema_options): - with pytest.raises(TraitError): - HasSchemaOptions(schema_options=schema_options) From 386e971ee73ea49eaccbf858fecf25426105cfaf Mon Sep 17 00:00:00 2001 From: Zach Sailer Date: Thu, 28 Jul 2022 07:56:27 -0700 Subject: [PATCH 14/14] Revert "remove redacted policies" This reverts commit b90ce6e8fe2aacab93c9e9960dc3a3739bb485f0. --- jupyter_events/logger.py | 14 +- jupyter_events/schema.py | 120 ++++++++++++++++++ jupyter_events/schema_registry.py | 23 +++- jupyter_events/schemas/event-metaschema.yml | 5 + .../schemas/property-metaschema.yml | 5 + jupyter_events/traits.py | 34 +++++ .../schemas/bad/missing-parent-policies.yaml | 17 +++ tests/schemas/bad/missing-policy-array.yaml | 30 +++++ ....yaml => missing-policy-nested-array.yaml} | 19 ++- tests/schemas/bad/reserved-property.yaml | 4 + tests/schemas/good/array.yaml | 10 ++ tests/schemas/good/basic.yaml | 4 + tests/schemas/good/nested-array.yaml | 16 +++ tests/test_logger.py | 19 +++ tests/test_redaction.py | 73 +++++++++++ tests/test_schema.py | 11 +- tests/test_schema_registry.py | 0 tests/test_traits.py | 45 ++++++- 18 files changed, 439 insertions(+), 10 deletions(-) create mode 100644 tests/schemas/bad/missing-parent-policies.yaml create mode 100644 tests/schemas/bad/missing-policy-array.yaml rename tests/schemas/bad/{nested-reserved-property.yaml => missing-policy-nested-array.yaml} (55%) create mode 100644 tests/test_redaction.py create mode 100644 tests/test_schema_registry.py diff --git a/jupyter_events/logger.py b/jupyter_events/logger.py index 5392adc..861c330 100644 --- a/jupyter_events/logger.py +++ b/jupyter_events/logger.py @@ -6,7 +6,7 @@ from datetime import datetime from pythonjsonlogger import jsonlogger -from traitlets import Instance, default +from traitlets 
import Instance, List, default from traitlets.config import Config, Configurable from . import EVENTS_METADATA_VERSION @@ -33,6 +33,14 @@ class EventLogger(Configurable): """, ).tag(config=True) + redacted_policies = List( + default_value=None, + allow_none=True, + help="""A list of the redactionPolicies that will be redacted + from emitted events. + """, + ).tag(config=True) + schemas = Instance( SchemaRegistry, help="""The SchemaRegistry for caching validated schemas @@ -42,7 +50,7 @@ class EventLogger(Configurable): @default("schemas") def _default_schemas(self) -> SchemaRegistry: - return SchemaRegistry() + return SchemaRegistry(redacted_policies=self.redacted_policies) def __init__(self, *args, **kwargs): # We need to initialize the configurable before @@ -156,7 +164,7 @@ def emit(self, id: str, version: int, event: dict, timestamp_override=None): "__metadata_version__": EVENTS_METADATA_VERSION, } # Process this event, i.e. validate and redact (in place) - self.schemas.validate_event(id, version, event) + self.schemas.process_event(id, version, event) capsule.update(event) self.log.info(capsule) return capsule diff --git a/jupyter_events/schema.py b/jupyter_events/schema.py index 4be7c42..8d4c70b 100644 --- a/jupyter_events/schema.py +++ b/jupyter_events/schema.py @@ -6,6 +6,65 @@ from .validators import validate_schema +def _pop_nested_redacted_fields( + schema_data: dict, policy_location: Sequence[Hashable] +) -> Any: + """Pop an item nested anywhere in a dictionary using the + list of (hashable) keys to locate the item. + """ + # Begin walking the sequence of keys to the policy + # location given. + nested_data = schema_data + for i, el in enumerate(policy_location[:-1]): + # Handle arrays of objects. + if el == "__array__": + for j, _ in enumerate(nested_data): + branch = policy_location[i + 1 :] + _pop_nested_redacted_fields(nested_data[j], branch) + return + # Try moving into nested child schema.
+ try: + nested_data = nested_data[el] + except KeyError: + return + # If we made it this far, we ended on a policy that needs to be popped. + return nested_data.pop(policy_location[-1]) + + +def _find_redaction_policies(schema: dict) -> Dict[str, list]: + """A recursive function that iterates an event schema + and returns a mapping of redaction policies to + (nested) properties (identified by a sequence of keys). + """ + redaction_policies: Dict[str, List[str]] = {} + + def _extract_policies(subschema, key_sequence=()): + props = subschema["properties"] + for key, obj in props.items(): + updated_key_sequence = key_sequence + (key,) + + def _nested_extract_policies(obj, updated_key_sequence): + if isinstance(obj, dict): + if "properties" in obj: + _extract_policies(obj, updated_key_sequence) + if "items" in obj and "properties" in obj["items"]: + _nested_extract_policies( + obj["items"], updated_key_sequence + ("__array__",) + ) + + _nested_extract_policies(obj, updated_key_sequence) + + # Update the list in place. + for policy in obj["redactionPolicies"]: + policies_list = redaction_policies.get(policy, []) + policies_list.append(updated_key_sequence) + redaction_policies[policy] = policies_list + + # Start the recursion + _extract_policies(schema) + return redaction_policies + + class EventSchema: """A validated schema that can be used. @@ -26,6 +85,15 @@ class EventSchema: resolver: RefResolver for nested JSON schema references. + + allowed_policies: set + A set of redaction policies allowed by this event schema. + Each property in the schema must have a `redactionPolicies` + annotation representing the level of sensitivity of the + data collected by this event. In order for that data + to be emitted by Jupyter Events, the matching redaction + policy must be listed here.
+ """ def __init__( @@ -33,13 +101,32 @@ def __init__( schema, validator_class=validators.Draft7Validator, resolver=None, + redacted_policies: Union[str, list, None] = None, ): # Validate the schema against Jupyter Events metaschema. validate_schema(schema) + # Build a mapping of all property redaction policies. + self._redaction_policies_locations = _find_redaction_policies(schema) + self._redacted_policies = self._validate_redacted_policies(redacted_policies) # Create a validator for this schema self._validator = validator_class(schema, resolver=resolver) self._schema = schema + def _validate_redacted_policies( + self, redacted_policies: Union[None, List, str, set] + ) -> set: + if redacted_policies is None: + return set() + value_type = type(redacted_policies) + if value_type == str and redacted_policies == "all": + return set(self.redaction_policies_locations.keys()) + if value_type == list or value_type == set: + return set(redacted_policies) + raise TypeError( + "redacted_policies must be the literal string, 'all', or a list of " + "redaction polices" + ) + @property def id(self) -> str: """Schema $id field.""" @@ -54,20 +141,53 @@ def version(self) -> int: def registry_key(self) -> Tuple[str, int]: return (self.id, self.version) + @property + def redacted_policies(self) -> set: + """The redaction policies that will be redacted when an + incoming event is processed. + """ + return self._redacted_policies + @classmethod def from_file( cls, filepath, validator_class=validators.Draft7Validator, resolver=None, + redacted_policies=None, ): schema = yaml.load(filepath) return cls( schema=schema, validator_class=validator_class, resolver=resolver, + redacted_policies=redacted_policies, ) + @property + def redaction_policies_locations(self) -> Dict[str, List[str]]: + """Mapping of the redaction policies in this schema to + the (nested) properties where they are defined.
+ """ + return self._redaction_policies_locations + def validate(self, data: dict) -> None: """Validate an incoming instance of this event schema.""" self._validator.validate(data) + + def enforce_redaction_policies(self, data: dict) -> None: + """Redact fields covered by the redacted policies from the event data (in place).""" + # # Find all policies not explicitly allowed. + # named_policies = set(self.redaction_policies_locations.keys()) + # redacted_policies = named_policies - self.unredacted_policies + for policy_type in self.redacted_policies: + policy_locations = self._redaction_policies_locations[policy_type] + for item in policy_locations: + _pop_nested_redacted_fields(data, item) + + def process(self, data: dict) -> None: + """Validate event data and enforce any redaction policies (in place). + Nothing is returned by this method, because the data is redacted in place. + """ + self.validate(data) + self.enforce_redaction_policies(data) diff --git a/jupyter_events/schema_registry.py b/jupyter_events/schema_registry.py index 471aa83..1dd57ca 100644 --- a/jupyter_events/schema_registry.py +++ b/jupyter_events/schema_registry.py @@ -10,8 +10,16 @@ class SchemaRegistryException(Exception): class SchemaRegistry: """A convenient API for storing and searching a group of schemas.""" - def __init__(self, schemas: dict = None): + def __init__(self, schemas: dict = None, redacted_policies: list = None): self._schemas = schemas or {} + self._redacted_policies = redacted_policies + + @property + def redacted_policies(self) -> Optional[List[Any]]: + """A list of policies that will be redacted from + all events validated against this registry.
+ """ + return self._redacted_policies def __contains__(self, registry_key: Tuple[str, int]): """Syntax sugar to check if a schema is found in the registry""" @@ -32,12 +40,14 @@ def register(self, data: dict): All schemas are validated against the Jupyter Events meta-schema found here: """ - schema = EventSchema(data) + schema = EventSchema(data, redacted_policies=self.redacted_policies) self._add(schema) def register_from_file(self, schema_filepath: str): """Register a schema from a file.""" - schema = EventSchema.from_file(schema_filepath) + schema = EventSchema.from_file( + schema_filepath, redacted_policies=self.redacted_policies + ) self._add(schema) def get(self, id: str, version: int) -> EventSchema: @@ -72,3 +82,10 @@ def validate_event(self, id: str, version: int, data: dict) -> None: """ schema = self.get(id, version) schema.validate(data) + + def process_event(self, id: str, version: int, data: dict) -> None: + """Validate an event and enforce any redaction policies (in place). + Nothing is returned by this method, because the data is redacted in place.
+ """ + schema = self.get(id, version) + schema.process(data) diff --git a/jupyter_events/schemas/event-metaschema.yml b/jupyter_events/schemas/event-metaschema.yml index 913e572..9bbd7e5 100644 --- a/jupyter_events/schemas/event-metaschema.yml +++ b/jupyter_events/schemas/event-metaschema.yml @@ -13,6 +13,10 @@ properties: type: string description: type: string + redactionPolicies: + type: array + items: + type: string properties: type: object additionalProperties: @@ -23,4 +27,5 @@ properties: required: - $id - version + - redactionPolicies - properties diff --git a/jupyter_events/schemas/property-metaschema.yml b/jupyter_events/schemas/property-metaschema.yml index 176a711..f2f58e3 100644 --- a/jupyter_events/schemas/property-metaschema.yml +++ b/jupyter_events/schemas/property-metaschema.yml @@ -11,6 +11,10 @@ properties: type: string description: type: string + redactionPolicies: + type: array + items: + type: string properties: type: object additionalProperties: @@ -23,6 +27,7 @@ properties: required: - title + - redactionPolicies additionalProperties: $ref: http://event.jupyter.org/property-metaschema diff --git a/jupyter_events/traits.py b/jupyter_events/traits.py index 020320c..848bd4b 100644 --- a/jupyter_events/traits.py +++ b/jupyter_events/traits.py @@ -42,3 +42,37 @@ def validate(self, obj, value): return value else: self.error(obj, value) + + +class SchemaOptions(TraitType): + """A trait for handling options for recording schemas.""" + + info_text = "either a dictionary with schema options or a list with schema names." + + def validate(self, obj, val): + # If the type is a dictionary. + if type(val) is dict: + for schema_name, data in val.items(): + given_keys = set(data.keys()) + # Compare against keys expected. + allowed_keys = {"allowed_categories", "allowed_properties"} + # There should be no extra keys (anything other than + # allowed_keys) in the schema options. 
+ unknown_keys = given_keys.difference(allowed_keys) + if unknown_keys: + # Throw an error if there are unknown keys. + raise TraitError( + "The schema option, {schema_name}, includes " + "unknown key(s): {unknown_keys}".format( + schema_name=schema_name, unknown_keys=",".join(unknown_keys) + ) + ) + validated_val = val + # If the type is a list (for backwards compatibility). + elif type(val) is list: + validated_val = {} + for schema_name in val: + validated_val[schema_name] = {} + else: + raise TraitError("SchemaOptions must be of type dict or list.") + return validated_val diff --git a/tests/schemas/bad/missing-parent-policies.yaml b/tests/schemas/bad/missing-parent-policies.yaml new file mode 100644 index 0000000..bbf0c8b --- /dev/null +++ b/tests/schemas/bad/missing-parent-policies.yaml @@ -0,0 +1,17 @@ +$id: http://event.jupyter.org/test-simple +version: 1 +title: Simple Test Schema +description: | + Fails validation because the root level of this schema + is missing redactionPolicies. +type: object +properties: + prop1: + title: Test Property 1 + description: | + Test property 1. + redactionPolicies: + - unrestricted + type: string +required: + - prop1 diff --git a/tests/schemas/bad/missing-policy-array.yaml b/tests/schemas/bad/missing-policy-array.yaml new file mode 100644 index 0000000..a6897f9 --- /dev/null +++ b/tests/schemas/bad/missing-policy-array.yaml @@ -0,0 +1,30 @@ +$id: http://event.jupyter.org/test +version: 1 +title: Schema with Array +description: | + A schema for an array of objects. +type: object +redactionPolicies: + - unrestricted +properties: + users: + title: Test User Array + description: | + Test User array. 
+ redactionPolicies: + - unrestricted + type: array + items: + type: object + title: User + redactionPolicies: + - unrestricted + properties: + email: + type: string + title: Email + id: + type: string + title: Name + redactionPolicies: + - user-identifier diff --git a/tests/schemas/bad/nested-reserved-property.yaml b/tests/schemas/bad/missing-policy-nested-array.yaml similarity index 55% rename from tests/schemas/bad/nested-reserved-property.yaml rename to tests/schemas/bad/missing-policy-nested-array.yaml index 4ea004d..e76b4ad 100644 --- a/tests/schemas/bad/nested-reserved-property.yaml +++ b/tests/schemas/bad/missing-policy-nested-array.yaml @@ -2,31 +2,46 @@ $id: http://event.jupyter.org/test version: 1 title: Schema with Array description: | - A schema for an array of objects. + Fails validation because an element in the nested array + doesn't have a redactionPolicies field (see "position"). type: object +redactionPolicies: + - unrestricted properties: users: title: Test User Array description: | Test User array. + redactionPolicies: + - unrestricted type: array items: type: object title: User + redactionPolicies: + - unrestricted properties: name: type: string title: Name + redactionPolicies: + - user-identifier hobbies: type: array title: Hobbies + redactionPolicies: + - unrestricted items: type: object title: Hobby + redactionPolicies: + - unrestricted properties: - __badName: + sport: title: Sport Name type: string + redactionPolicies: + - unrestricted position: title: Position type: string diff --git a/tests/schemas/bad/reserved-property.yaml b/tests/schemas/bad/reserved-property.yaml index 919a937..d543823 100644 --- a/tests/schemas/bad/reserved-property.yaml +++ b/tests/schemas/bad/reserved-property.yaml @@ -4,8 +4,12 @@ title: Simple Test Schema description: | A simple schema for testing type: object +redactionPolicies: + - unrestricted properties: __badName: title: Test Property description: Test property. 
+ redactionPolicies: + - unrestricted type: string diff --git a/tests/schemas/good/array.yaml b/tests/schemas/good/array.yaml index a917374..336af90 100644 --- a/tests/schemas/good/array.yaml +++ b/tests/schemas/good/array.yaml @@ -4,19 +4,29 @@ title: Schema with Array description: | A schema for an array of objects. type: object +redactionPolicies: + - unrestricted properties: users: title: Test User Array description: | Test User array. + redactionPolicies: + - unrestricted type: array items: type: object title: User + redactionPolicies: + - unrestricted properties: email: type: string title: Email + redactionPolicies: + - user-identifiable-information id: type: string title: Name + redactionPolicies: + - user-identifier diff --git a/tests/schemas/good/basic.yaml b/tests/schemas/good/basic.yaml index 33a73e4..b33b45e 100644 --- a/tests/schemas/good/basic.yaml +++ b/tests/schemas/good/basic.yaml @@ -4,8 +4,12 @@ title: Simple Test Schema description: | A simple schema for testing type: object +redactionPolicies: + - unrestricted properties: prop: title: Test Property description: Test property. + redactionPolicies: + - unrestricted type: string diff --git a/tests/schemas/good/nested-array.yaml b/tests/schemas/good/nested-array.yaml index f54c3cf..61bc86d 100644 --- a/tests/schemas/good/nested-array.yaml +++ b/tests/schemas/good/nested-array.yaml @@ -4,29 +4,45 @@ title: Schema with Array description: | A schema for an array of objects. type: object +redactionPolicies: + - unrestricted properties: users: title: Test User Array description: | Test User array. 
+ redactionPolicies: + - unrestricted type: array items: type: object title: User + redactionPolicies: + - unrestricted properties: name: type: string title: Name + redactionPolicies: + - user-identifier hobbies: type: array title: Hobbies + redactionPolicies: + - unrestricted items: type: object title: Hobby + redactionPolicies: + - unrestricted properties: sport: title: Sport Name type: string + redactionPolicies: + - unrestricted position: title: Position type: string + redactionPolicies: + - user-identifiable-information diff --git a/tests/test_logger.py b/tests/test_logger.py index 603ea37..a05fc09 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -99,10 +99,12 @@ def test_timestamp_override(): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -128,10 +130,12 @@ def test_emit(): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -170,11 +174,13 @@ def test_register_schema_file(tmp_path): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -193,11 +199,13 @@ def test_register_schema_file_object(tmp_path): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -217,15 +225,18 @@ def test_emit_badschema(): schema = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, 
"status": { "enum": ["success", "failure"], "title": "test 2", + "redactionPolicies": ["unrestricted"], }, }, } @@ -242,11 +253,13 @@ def test_unique_logger_instances(): schema0 = { "$id": "test/test0", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -254,11 +267,13 @@ def test_unique_logger_instances(): schema1 = { "$id": "test/test1", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -322,11 +337,13 @@ def test_register_duplicate_schemas(): schema0 = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } @@ -334,11 +351,13 @@ def test_register_duplicate_schemas(): schema1 = { "$id": "test/test", "version": 1, + "redactionPolicies": ["unrestricted"], "type": "object", "properties": { "something": { "type": "string", "title": "test", + "redactionPolicies": ["unrestricted"], }, }, } diff --git a/tests/test_redaction.py b/tests/test_redaction.py new file mode 100644 index 0000000..61ea086 --- /dev/null +++ b/tests/test_redaction.py @@ -0,0 +1,73 @@ +import pathlib + +import pytest + +from jupyter_events.schema import EventSchema + +SCHEMA_PATH = pathlib.Path(__file__).parent / "schemas" + + +@pytest.mark.parametrize( + "schema_file,redacted_policies,data,data_out", + [ + [ + # Schema name + "array.yaml", + # Redacted policies + ["user-identifier", "user-identifiable-information"], + # Unredacted data + { + "nothing-exciting": "hello, world", + "users": [ + {"id": "test id 0", "email": "test0@testemail.com"}, + {"id": "test id 1", "email": "test1@testemail.com"}, + ], + }, + # Redacted data + { + "nothing-exciting": "hello, world", + 
"users": [{}, {}], + }, + ], + [ + # Schema name + "nested-array.yaml", + # Redacted policies + ["user-identifier", "user-identifiable-information"], + # Unredacted data + { + "nothing-exciting": "hello, world", + "users": [ + { + "name": "Alice", + "hobbies": [ + {"sport": "basketball", "position": "guard"}, + {"sport": "soccer", "position": "striker"}, + ], + }, + { + "name": "Bob", + "hobbies": [ + {"sport": "basketball", "position": "center"}, + {"sport": "soccer", "position": "goalie"}, + ], + }, + ], + }, + # Redacted data + { + "nothing-exciting": "hello, world", + "users": [ + {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, + {"hobbies": [{"sport": "basketball"}, {"sport": "soccer"}]}, + ], + }, + ], + ], +) +def test_redaction_in_arrays(schema_file, redacted_policies, data, data_out): + schema = EventSchema.from_file( + SCHEMA_PATH / "good" / schema_file, redacted_policies=redacted_policies + ) + schema.enforce_redaction_policies(data) + assert data == data_out diff --git a/tests/test_schema.py b/tests/test_schema.py index 098a890..830c57d 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -6,9 +6,18 @@ from .utils import SCHEMA_PATH +MISSING_REDACTION_POLICY = "'redactionPolicies' is a required property" + BAD_SCHEMAS = [ + [ + # Bad schema file. + "missing-parent-policies.yaml", + # The expected valdation error message. 
+ MISSING_REDACTION_POLICY, + ], + ["missing-policy-array.yaml", MISSING_REDACTION_POLICY], + ["missing-policy-nested-array.yaml", MISSING_REDACTION_POLICY], ["reserved-property.yaml", "Properties starting with 'dunder'"], - ["nested-reserved-property.yaml", "Properties starting with 'dunder'"], ] diff --git a/tests/test_schema_registry.py b/tests/test_schema_registry.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_traits.py b/tests/test_traits.py index a6878f2..1740f18 100644 --- a/tests/test_traits.py +++ b/tests/test_traits.py @@ -3,7 +3,7 @@ import pytest from traitlets import HasTraits, TraitError -from jupyter_events.traits import Handlers +from jupyter_events.traits import Handlers, SchemaOptions class HasHandlers(HasTraits): @@ -27,3 +27,46 @@ def test_mixed_handlers_values(): handlers = [logging.NullHandler(), 1] with pytest.raises(TraitError): HasHandlers(handlers=handlers) + + +class HasSchemaOptions(HasTraits): + schema_options = SchemaOptions({}, allow_none=True) + + +@pytest.mark.parametrize( + "schema_options", + [ + # schema_options can be a list of schema_names. In this case, + # the SchemaOptions trait will turn this list into a dictionary + # with the list items as keys the values as empty dictionaries. + ["schema_name_1", "schema_name_2"], + # Empty nested config are okay. + {"schema_name_1": {}}, + # Nested config with empty values is okay too. + {"schema_name_1": {"allowed_categories": []}}, + # Test complete config for good measure. + {"schema_name_1": {"allowed_categories": ["value"]}}, + # Test multiple values. + {"schema_name_1": {"allowed_categories": ["value"]}, "schema_name_2": {}}, + ], +) +def test_good_schema_options(schema_options): + obj = HasSchemaOptions(schema_options=schema_options) + assert type(obj.schema_options) == dict + + +@pytest.mark.parametrize( + "schema_options", + [ + # Raise an error if Schema Options has unknown attribute. 
+ {"schema_name_1": {"unknown_attribute": []}}, + # Test multiple values. + { + "schema_name_1": {"allowed_categories": ["value"]}, + "schema_name_2": {"unknown_attribute": []}, + }, + ], +) +def test_bad_schema_options(schema_options): + with pytest.raises(TraitError): + HasSchemaOptions(schema_options=schema_options)