From 01dd303a59d5ffee41cd433171e15f10fb196fc6 Mon Sep 17 00:00:00 2001 From: Peter Amstutz Date: Fri, 30 Sep 2016 10:56:50 -0400 Subject: [PATCH] Optimize validate (#60) Validation can now either fail fast (raise_ex=False) or fail with explanation (raise_ex=True). Improve validation performance by using fail fast when checking unions. Optimize record validation by performing explicit check for "class" first. Improve validation error reporting for unions when one of the types is an exact match for class by suppressing errors for other types in the union. Other changes: * Use unicode strings more consistently. * makedoc applies table styling * Add version constraint to lockfile package dependency --- schema_salad/makedoc.py | 6 ++ schema_salad/schema.py | 48 +++++++---- schema_salad/validate.py | 178 +++++++++++++++++++++++++++------------ setup.py | 4 +- 4 files changed, 165 insertions(+), 71 deletions(-) diff --git a/schema_salad/makedoc.py b/schema_salad/makedoc.py index 7cdc1162..5303c818 100644 --- a/schema_salad/makedoc.py +++ b/schema_salad/makedoc.py @@ -49,6 +49,12 @@ def __init__(self): # type: () -> None def header(self, text, level, raw=None): return """%s""" % (level, to_id(text), text, level) + def table(self, header, body): + return ( + '\n%s\n' + '\n%s\n
\n' + ) % (header, body) + def to_id(text): # type: (Union[str, unicode]) -> Union[str, unicode] textid = text diff --git a/schema_salad/schema.py b/schema_salad/schema.py index 11a227b5..f3172892 100644 --- a/schema_salad/schema.py +++ b/schema_salad/schema.py @@ -233,35 +233,49 @@ def validate_doc(schema_names, doc, loader, strict): else: raise validate.ValidationException("Document must be dict or list") + roots = [] + for r in schema_names.names.values(): + if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or ( + r.props.get(u"documentRoot"))): + roots.append(r) + anyerrors = [] for pos, item in enumerate(validate_doc): - errors = [] success = False - for r in schema_names.names.values(): - if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or ( - u"documentRoot" in r.props)): + for r in roots: + success = validate.validate_ex( + r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=False) + if success: + break + + if not success: + errors = [] # type: List[unicode] + for r in roots: + if hasattr(r, "get_prop"): + name = r.get_prop(u"name") + elif hasattr(r, "name"): + name = r.name + try: validate.validate_ex( - r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties) - success = True + r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=True) + except validate.ClassValidationException as e: + errors = [u"Could not validate `%s` because\n%s" % ( + name, validate.indent(str(e), nolead=False))] break except validate.ValidationException as e: - if hasattr(r, "get_prop"): - name = r.get_prop(u"name") - elif hasattr(r, "name"): - name = r.name - errors.append("Could not validate as `%s` because\n%s" % ( + errors.append(u"Could not validate as `%s` because\n%s" % ( name, validate.indent(str(e), nolead=False))) - if not success: - objerr = "Validation error at position %i" % pos + + objerr = u"Validation error at position %i" % pos for ident in loader.identifiers: if ident in item: - objerr = "Validation error in object %s" % (item[ident]) + objerr = u"Validation error in object %s" % (item[ident]) break - anyerrors.append("%s\n%s" % - (objerr, validate.indent("\n".join(errors)))) + anyerrors.append(u"%s\n%s" % + (objerr, validate.indent(u"\n".join(errors)))) if anyerrors: - raise validate.ValidationException("\n".join(anyerrors)) + raise validate.ValidationException(u"\n".join(anyerrors)) def replace_type(items, spec, loader, found): diff --git a/schema_salad/validate.py b/schema_salad/validate.py index 90b02e97..5cb9032e 100644 --- a/schema_salad/validate.py +++ b/schema_salad/validate.py @@ -2,28 +2,28 @@ import avro.schema import sys import urlparse -from typing import Any +from typing import Any, Union class ValidationException(Exception): pass +class ClassValidationException(ValidationException): + pass + def validate(expected_schema, datum, identifiers=set(), strict=False, foreign_properties=set()): # type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool - try: - return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties) - except ValidationException: - return False + return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=False) INT_MIN_VALUE = -(1 << 31) INT_MAX_VALUE = (1 << 31) - 1 LONG_MIN_VALUE = -(1 << 63) LONG_MAX_VALUE = (1 << 63) - 1 -def indent(v, nolead=False): # type: (str, bool) -> str +def indent(v, nolead=False): # type: (Union[str, unicode], bool) -> unicode if nolead: - return v.splitlines()[0] + "\n".join([" " + l for l in v.splitlines()[1:]]) + return v.splitlines()[0] + u"\n".join([u" " + l for l in v.splitlines()[1:]]) else: - return "\n".join([" " + l for l in v.splitlines()]) + return u"\n".join([" " + l for l in v.splitlines()]) def friendly(v): # type: (Any) -> Any if isinstance(v, avro.schema.NamedSchema): @@ -37,11 +37,11 @@ def friendly(v): # type: (Any) -> Any else: return v -def multi(v, q=""): # type: (str, str) -> str +def multi(v, q=""): # type: (Union[str, unicode], Union[str, unicode]) -> unicode if '\n' in v: - return "%s%s%s\n" % (q, v, q) + return u"%s%s%s\n" % (q, v, q) else: - return "%s%s%s" % (q, v, q) + return u"%s%s%s" % (q, v, q) def vpformat(datum): # type: (Any) -> str a = pprint.pformat(datum) @@ -50,8 +50,8 @@ def vpformat(datum): # type: (Any) -> str return a def validate_ex(expected_schema, datum, identifiers=None, strict=False, - foreign_properties=None): - # type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool + foreign_properties=None, raise_ex=True): + # type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode], bool) -> bool """Determine if a python datum is an instance of a schema.""" if not identifiers: @@ -66,93 +66,154 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False, if datum is None: return True else: - raise ValidationException("the value `%s` is not null" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not null" % vpformat(datum)) + else: + return False elif schema_type == 'boolean': if isinstance(datum, bool): return True else: - raise ValidationException("the value `%s` is not boolean" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not boolean" % vpformat(datum)) + else: + return False elif schema_type == 'string': if isinstance(datum, basestring): return True elif isinstance(datum, bytes): - datum = datum.decode("utf-8") + datum = datum.decode(u"utf-8") return True else: - raise ValidationException("the value `%s` is not string" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not string" % vpformat(datum)) + else: + return False elif schema_type == 'bytes': if isinstance(datum, str): return True else: - raise ValidationException("the value `%s` is not bytes" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not bytes" % vpformat(datum)) + else: + return False elif schema_type == 'int': if ((isinstance(datum, int) or isinstance(datum, long)) and INT_MIN_VALUE <= datum <= INT_MAX_VALUE): return True else: - raise ValidationException("`%s` is not int" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"`%s` is not int" % vpformat(datum)) + else: + return False elif schema_type == 'long': if ((isinstance(datum, int) or isinstance(datum, long)) and LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE): return True else: - raise ValidationException("the value `%s` is not long" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not long" % vpformat(datum)) + else: + return False elif schema_type in ['float', 'double']: if (isinstance(datum, int) or isinstance(datum, long) or isinstance(datum, float)): return True else: - raise ValidationException("the value `%s` is not float or double" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not float or double" % vpformat(datum)) + else: + return False elif isinstance(expected_schema, avro.schema.FixedSchema): if isinstance(datum, str) and len(datum) == expected_schema.size: return True else: - raise ValidationException("the value `%s` is not fixed" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"the value `%s` is not fixed" % vpformat(datum)) + else: + return False elif isinstance(expected_schema, avro.schema.EnumSchema): if expected_schema.name == "Any": if datum is not None: return True else: - raise ValidationException("Any type must be non-null") + if raise_ex: + raise ValidationException(u"'Any' type must be non-null") + else: + return False if datum in expected_schema.symbols: return True else: - raise ValidationException("the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'")) + if raise_ex: + raise ValidationException(u"the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'")) + else: + return False elif isinstance(expected_schema, avro.schema.ArraySchema): if isinstance(datum, list): for i, d in enumerate(datum): try: - validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties) + if not validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex): + return False except ValidationException as v: - raise ValidationException("At position %i\n%s" % (i, indent(str(v)))) - return True - else: - raise ValidationException("the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items))) - elif isinstance(expected_schema, avro.schema.MapSchema): - if (isinstance(datum, dict) and - False not in [isinstance(k, basestring) for k in datum.keys()] and - False not in [validate(expected_schema.values, v, strict=strict) for v in datum.values()]): + if raise_ex: + raise ValidationException(u"At position %i\n%s" % (i, indent(str(v)))) + else: + return False return True else: - raise ValidationException("`%s` is not a valid map value, expected\n %s" % (vpformat(datum), vpformat(expected_schema.values))) + if raise_ex: + raise ValidationException(u"the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items))) + else: + return False elif isinstance(expected_schema, avro.schema.UnionSchema): - if True in [validate(s, datum, identifiers, strict=strict) for s in expected_schema.schemas]: - return True - else: - errors = [] - for s in expected_schema.schemas: - try: - validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties) - except ValidationException as e: - errors.append(str(e)) - raise ValidationException("the value %s is not a valid type in the union, expected one of:\n%s" % (multi(vpformat(datum), '`'), "\n".join(["- %s, but\n %s" % (friendly(expected_schema.schemas[i]), indent(multi(errors[i]))) for i in range(0, len(expected_schema.schemas))]))) + for s in expected_schema.schemas: + if validate_ex(s, datum, identifiers, strict=strict, raise_ex=False): + return True + + if not raise_ex: + return False + + errors = [] # type: List[unicode] + for s in expected_schema.schemas: + try: + validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=True) + except ClassValidationException as e: + raise + except ValidationException as e: + errors.append(unicode(e)) + + raise ValidationException(u"the value %s is not a valid type in the union, expected one of:\n%s" % ( + multi(vpformat(datum), '`'), u"\n".join([ + u"- %s, but\n %s" % ( + friendly(expected_schema.schemas[i]), indent(multi(errors[i]))) + for i in range(0, len(expected_schema.schemas))]))) elif isinstance(expected_schema, avro.schema.RecordSchema): if not isinstance(datum, dict): - raise ValidationException("`%s`\n is not a dict" % vpformat(datum)) + if raise_ex: + raise ValidationException(u"`%s`\n is not a dict" % vpformat(datum)) + else: + return False + + classmatch = None + for f in expected_schema.fields: + if f.name == "class": + d = datum.get("class") + if not d: + if raise_ex: + raise ValidationException(u"Missing 'class' field") + else: + return False + if not validate_ex(f.type, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex): + return False + classmatch = d + break errors = [] for f in expected_schema.fields: + if f.name == "class": + continue + if f.name in datum: fieldval = datum[f.name] else: @@ -162,12 +223,14 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False, fieldval = None try: - validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties) + if not validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex): + return False except ValidationException as v: if f.name not in datum: - errors.append("missing required field `%s`" % f.name) + errors.append(u"missing required field `%s`" % f.name) else: - errors.append("could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v))))) + errors.append(u"could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v))))) + if strict: for d in datum: found = False @@ -176,14 +239,25 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False, found = True if not found: if d not in identifiers and d not in foreign_properties and d[0] not in ("@", "$"): + if not raise_ex: + return False split = urlparse.urlsplit(d) if split.scheme: - errors.append("could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d)) + errors.append(u"could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d)) else: - errors.append("could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields))) + errors.append(u"could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields))) if errors: - raise ValidationException("\n".join(errors)) + if raise_ex: + if classmatch: + raise ClassValidationException(u"%s record %s" % (classmatch, "\n".join(errors))) + else: + raise ValidationException(u"\n".join(errors)) + else: + return False else: return True - raise ValidationException("Unrecognized schema_type %s" % schema_type) + if raise_ex: + raise ValidationException(u"Unrecognized schema_type %s" % schema_type) + else: + return False diff --git a/setup.py b/setup.py index a6773096..fa639a9d 100755 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ 'mistune', 'typing >= 3.5.2', 'CacheControl', - 'lockfile'] + 'lockfile >= 0.9'] install_requires.append("avro") # TODO: remove me once cwltool is # available in Debian Stable, Ubuntu 12.04 LTS @@ -46,7 +46,7 @@ extras_require = {} # TODO: to be removed when the above is added setup(name='schema-salad', - version='1.17', + version='1.18', description='Schema Annotations for Linked Avro Data (SALAD)', long_description=open(README).read(), author='Common workflow language working group',