Skip to content

Commit

Permalink
Optimize validate (#60)
Browse files Browse the repository at this point in the history
Validation can now either fail fast (raise_ex=False) or fail with
explanation (raise_ex=True).  Improve validation performance by using fail fast
when checking unions.  Optimize record validation by performing explicit check
for "class" first.  Improve validation error reporting for unions when one of
the types is an exact match for class by suppressing errors for other types in
the union.

Other changes:
* Use unicode strings more consistently.
* makedoc applies table styling
* Add version constraint to lockfile package dependency
  • Loading branch information
tetron authored Sep 30, 2016
1 parent 423e48b commit 01dd303
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 71 deletions.
6 changes: 6 additions & 0 deletions schema_salad/makedoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ def __init__(self): # type: () -> None
def header(self, text, level, raw=None):
return """<h%i id="%s">%s</h%i>""" % (level, to_id(text), text, level)

def table(self, header, body):
return (
'<table class="table table-striped">\n<thead>%s</thead>\n'
'<tbody>\n%s</tbody>\n</table>\n'
) % (header, body)


def to_id(text): # type: (Union[str, unicode]) -> Union[str, unicode]
textid = text
Expand Down
48 changes: 31 additions & 17 deletions schema_salad/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,35 +233,49 @@ def validate_doc(schema_names, doc, loader, strict):
else:
raise validate.ValidationException("Document must be dict or list")

roots = []
for r in schema_names.names.values():
if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or (
r.props.get(u"documentRoot"))):
roots.append(r)

anyerrors = []
for pos, item in enumerate(validate_doc):
errors = []
success = False
for r in schema_names.names.values():
if ((hasattr(r, "get_prop") and r.get_prop(u"documentRoot")) or (
u"documentRoot" in r.props)):
for r in roots:
success = validate.validate_ex(
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=False)
if success:
break

if not success:
errors = [] # type: List[unicode]
for r in roots:
if hasattr(r, "get_prop"):
name = r.get_prop(u"name")
elif hasattr(r, "name"):
name = r.name

try:
validate.validate_ex(
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties)
success = True
r, item, loader.identifiers, strict, foreign_properties=loader.foreign_properties, raise_ex=True)
except validate.ClassValidationException as e:
errors = [u"Could not validate `%s` because\n%s" % (
name, validate.indent(str(e), nolead=False))]
break
except validate.ValidationException as e:
if hasattr(r, "get_prop"):
name = r.get_prop(u"name")
elif hasattr(r, "name"):
name = r.name
errors.append("Could not validate as `%s` because\n%s" % (
errors.append(u"Could not validate as `%s` because\n%s" % (
name, validate.indent(str(e), nolead=False)))
if not success:
objerr = "Validation error at position %i" % pos

objerr = u"Validation error at position %i" % pos
for ident in loader.identifiers:
if ident in item:
objerr = "Validation error in object %s" % (item[ident])
objerr = u"Validation error in object %s" % (item[ident])
break
anyerrors.append("%s\n%s" %
(objerr, validate.indent("\n".join(errors))))
anyerrors.append(u"%s\n%s" %
(objerr, validate.indent(u"\n".join(errors))))
if anyerrors:
raise validate.ValidationException("\n".join(anyerrors))
raise validate.ValidationException(u"\n".join(anyerrors))


def replace_type(items, spec, loader, found):
Expand Down
178 changes: 126 additions & 52 deletions schema_salad/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,28 @@
import avro.schema
import sys
import urlparse
from typing import Any
from typing import Any, Union

class ValidationException(Exception):
pass

class ClassValidationException(ValidationException):
pass

def validate(expected_schema, datum, identifiers=set(), strict=False, foreign_properties=set()):
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool
try:
return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties)
except ValidationException:
return False
return validate_ex(expected_schema, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=False)

INT_MIN_VALUE = -(1 << 31)
INT_MAX_VALUE = (1 << 31) - 1
LONG_MIN_VALUE = -(1 << 63)
LONG_MAX_VALUE = (1 << 63) - 1

def indent(v, nolead=False): # type: (str, bool) -> str
def indent(v, nolead=False): # type: (Union[str, unicode], bool) -> unicode
if nolead:
return v.splitlines()[0] + "\n".join([" " + l for l in v.splitlines()[1:]])
return v.splitlines()[0] + u"\n".join([u" " + l for l in v.splitlines()[1:]])
else:
return "\n".join([" " + l for l in v.splitlines()])
return u"\n".join([" " + l for l in v.splitlines()])

def friendly(v): # type: (Any) -> Any
if isinstance(v, avro.schema.NamedSchema):
Expand All @@ -37,11 +37,11 @@ def friendly(v): # type: (Any) -> Any
else:
return v

def multi(v, q=""): # type: (str, str) -> str
def multi(v, q=""): # type: (Union[str, unicode], Union[str, unicode]) -> unicode
if '\n' in v:
return "%s%s%s\n" % (q, v, q)
return u"%s%s%s\n" % (q, v, q)
else:
return "%s%s%s" % (q, v, q)
return u"%s%s%s" % (q, v, q)

def vpformat(datum): # type: (Any) -> str
a = pprint.pformat(datum)
Expand All @@ -50,8 +50,8 @@ def vpformat(datum): # type: (Any) -> str
return a

def validate_ex(expected_schema, datum, identifiers=None, strict=False,
foreign_properties=None):
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode]) -> bool
foreign_properties=None, raise_ex=True):
# type: (avro.schema.Schema, Any, Set[unicode], bool, Set[unicode], bool) -> bool
"""Determine if a python datum is an instance of a schema."""

if not identifiers:
Expand All @@ -66,93 +66,154 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
if datum is None:
return True
else:
raise ValidationException("the value `%s` is not null" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not null" % vpformat(datum))
else:
return False
elif schema_type == 'boolean':
if isinstance(datum, bool):
return True
else:
raise ValidationException("the value `%s` is not boolean" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not boolean" % vpformat(datum))
else:
return False
elif schema_type == 'string':
if isinstance(datum, basestring):
return True
elif isinstance(datum, bytes):
datum = datum.decode("utf-8")
datum = datum.decode(u"utf-8")
return True
else:
raise ValidationException("the value `%s` is not string" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not string" % vpformat(datum))
else:
return False
elif schema_type == 'bytes':
if isinstance(datum, str):
return True
else:
raise ValidationException("the value `%s` is not bytes" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not bytes" % vpformat(datum))
else:
return False
elif schema_type == 'int':
if ((isinstance(datum, int) or isinstance(datum, long))
and INT_MIN_VALUE <= datum <= INT_MAX_VALUE):
return True
else:
raise ValidationException("`%s` is not int" % vpformat(datum))
if raise_ex:
raise ValidationException(u"`%s` is not int" % vpformat(datum))
else:
return False
elif schema_type == 'long':
if ((isinstance(datum, int) or isinstance(datum, long))
and LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE):
return True
else:
raise ValidationException("the value `%s` is not long" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not long" % vpformat(datum))
else:
return False
elif schema_type in ['float', 'double']:
if (isinstance(datum, int) or isinstance(datum, long)
or isinstance(datum, float)):
return True
else:
raise ValidationException("the value `%s` is not float or double" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not float or double" % vpformat(datum))
else:
return False
elif isinstance(expected_schema, avro.schema.FixedSchema):
if isinstance(datum, str) and len(datum) == expected_schema.size:
return True
else:
raise ValidationException("the value `%s` is not fixed" % vpformat(datum))
if raise_ex:
raise ValidationException(u"the value `%s` is not fixed" % vpformat(datum))
else:
return False
elif isinstance(expected_schema, avro.schema.EnumSchema):
if expected_schema.name == "Any":
if datum is not None:
return True
else:
raise ValidationException("Any type must be non-null")
if raise_ex:
raise ValidationException(u"'Any' type must be non-null")
else:
return False
if datum in expected_schema.symbols:
return True
else:
raise ValidationException("the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'"))
if raise_ex:
raise ValidationException(u"the value `%s`\n is not a valid symbol in enum %s, expected one of %s" % (vpformat(datum), expected_schema.name, "'" + "', '".join(expected_schema.symbols) + "'"))
else:
return False
elif isinstance(expected_schema, avro.schema.ArraySchema):
if isinstance(datum, list):
for i, d in enumerate(datum):
try:
validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties)
if not validate_ex(expected_schema.items, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
return False
except ValidationException as v:
raise ValidationException("At position %i\n%s" % (i, indent(str(v))))
return True
else:
raise ValidationException("the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items)))
elif isinstance(expected_schema, avro.schema.MapSchema):
if (isinstance(datum, dict) and
False not in [isinstance(k, basestring) for k in datum.keys()] and
False not in [validate(expected_schema.values, v, strict=strict) for v in datum.values()]):
if raise_ex:
raise ValidationException(u"At position %i\n%s" % (i, indent(str(v))))
else:
return False
return True
else:
raise ValidationException("`%s` is not a valid map value, expected\n %s" % (vpformat(datum), vpformat(expected_schema.values)))
if raise_ex:
raise ValidationException(u"the value `%s` is not a list, expected list of %s" % (vpformat(datum), friendly(expected_schema.items)))
else:
return False
elif isinstance(expected_schema, avro.schema.UnionSchema):
if True in [validate(s, datum, identifiers, strict=strict) for s in expected_schema.schemas]:
return True
else:
errors = []
for s in expected_schema.schemas:
try:
validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties)
except ValidationException as e:
errors.append(str(e))
raise ValidationException("the value %s is not a valid type in the union, expected one of:\n%s" % (multi(vpformat(datum), '`'), "\n".join(["- %s, but\n %s" % (friendly(expected_schema.schemas[i]), indent(multi(errors[i]))) for i in range(0, len(expected_schema.schemas))])))
for s in expected_schema.schemas:
if validate_ex(s, datum, identifiers, strict=strict, raise_ex=False):
return True

if not raise_ex:
return False

errors = [] # type: List[unicode]
for s in expected_schema.schemas:
try:
validate_ex(s, datum, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=True)
except ClassValidationException as e:
raise
except ValidationException as e:
errors.append(unicode(e))

raise ValidationException(u"the value %s is not a valid type in the union, expected one of:\n%s" % (
multi(vpformat(datum), '`'), u"\n".join([
u"- %s, but\n %s" % (
friendly(expected_schema.schemas[i]), indent(multi(errors[i])))
for i in range(0, len(expected_schema.schemas))])))

elif isinstance(expected_schema, avro.schema.RecordSchema):
if not isinstance(datum, dict):
raise ValidationException("`%s`\n is not a dict" % vpformat(datum))
if raise_ex:
raise ValidationException(u"`%s`\n is not a dict" % vpformat(datum))
else:
return False

classmatch = None
for f in expected_schema.fields:
if f.name == "class":
d = datum.get("class")
if not d:
if raise_ex:
raise ValidationException(u"Missing 'class' field")
else:
return False
if not validate_ex(f.type, d, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
return False
classmatch = d
break

errors = []
for f in expected_schema.fields:
if f.name == "class":
continue

if f.name in datum:
fieldval = datum[f.name]
else:
Expand All @@ -162,12 +223,14 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
fieldval = None

try:
validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties)
if not validate_ex(f.type, fieldval, identifiers, strict=strict, foreign_properties=foreign_properties, raise_ex=raise_ex):
return False
except ValidationException as v:
if f.name not in datum:
errors.append("missing required field `%s`" % f.name)
errors.append(u"missing required field `%s`" % f.name)
else:
errors.append("could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v)))))
errors.append(u"could not validate field `%s` because\n%s" % (f.name, multi(indent(str(v)))))

if strict:
for d in datum:
found = False
Expand All @@ -176,14 +239,25 @@ def validate_ex(expected_schema, datum, identifiers=None, strict=False,
found = True
if not found:
if d not in identifiers and d not in foreign_properties and d[0] not in ("@", "$"):
if not raise_ex:
return False
split = urlparse.urlsplit(d)
if split.scheme:
errors.append("could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d))
errors.append(u"could not validate extension field `%s` because it is not recognized and strict is True. Did you include a $schemas section?" % (d))
else:
errors.append("could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields)))
errors.append(u"could not validate field `%s` because it is not recognized and strict is True, valid fields are: %s" % (d, ", ".join(fn.name for fn in expected_schema.fields)))

if errors:
raise ValidationException("\n".join(errors))
if raise_ex:
if classmatch:
raise ClassValidationException(u"%s record %s" % (classmatch, "\n".join(errors)))
else:
raise ValidationException(u"\n".join(errors))
else:
return False
else:
return True
raise ValidationException("Unrecognized schema_type %s" % schema_type)
if raise_ex:
raise ValidationException(u"Unrecognized schema_type %s" % schema_type)
else:
return False
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
'mistune',
'typing >= 3.5.2',
'CacheControl',
'lockfile']
'lockfile >= 0.9']

install_requires.append("avro") # TODO: remove me once cwltool is
# available in Debian Stable, Ubuntu 12.04 LTS
Expand All @@ -46,7 +46,7 @@
extras_require = {} # TODO: to be removed when the above is added

setup(name='schema-salad',
version='1.17',
version='1.18',
description='Schema Annotations for Linked Avro Data (SALAD)',
long_description=open(README).read(),
author='Common workflow language working group',
Expand Down

0 comments on commit 01dd303

Please sign in to comment.