diff --git a/zschema/__main__.py b/zschema/__main__.py
index 321a35f..558622b 100644
--- a/zschema/__main__.py
+++ b/zschema/__main__.py
@@ -11,8 +11,9 @@ def usage():
     sys.stderr.write("USAGE: %s command schema [file].\n" % sys.argv[0].split("/")[-1])
-    sys.stderr.write("Valid commands: bigquery, elasticsearch, json, text, html, censys-html, flat, validate.\n")
-    sys.stderr.write("schema should be defined as file.py:record\n")
+    sys.stderr.write("Valid commands: bigquery, elasticsearch, docs-es, docs-bq, json, flat, validate.\n")
+    sys.stderr.write("Schema should be passed as file.py:record\n")
+    sys.stderr.write("The optional 'file' argument is used only as the test file for the 'validate' command.\n")
     sys.stderr.write("VERSION: %s\n" % zschema.__version__)
     sys.exit(1)
 
@@ -27,26 +28,15 @@ def main():
         print json.dumps(record.to_bigquery())
     elif command == "elasticsearch":
         print json.dumps(record.to_es(recname))
+    elif command == "docs-es":
+        print json.dumps(record.docs_es(recname))
+    elif command == "docs-bq":
+        print json.dumps(record.docs_bq(recname))
     elif command == "json":
         print record.to_json()
-    elif command == "html":
-        for r in record.to_flat():
-            type_ = r.get("es_type", "")
-            print "<tr><td>%s</td><td>%s</td></tr>" % (r["name"], type_)
-    elif command == "text":
-        print record.to_text()
     elif command == "flat":
         for r in record.to_flat():
             print json.dumps(r)
-    elif command == "censys-html":
-        for r in record.to_flat():
-            type_ = r.get("es_type", None)
-            len_ = r["name"].count(".")
-            style = 'style="padding-left: %ipx"' % (15 * len_ + 5)
-            if not type_:
-                print '<tr><td %s>%s</td><td>%s</td></tr>' % (style, r["name"], "")
-            else:
-                print "<tr><td %s>%s</td><td>%s</td></tr>" % (style, r["name"], type_)
     elif command == "validate":
         if not os.path.exists(sys.argv[3]):
             sys.stderr.write("Invalid test file. %s does not exist.\n" % sys.argv[3])
diff --git a/zschema/compounds.py b/zschema/compounds.py
index d107b60..cf3a245 100644
--- a/zschema/compounds.py
+++ b/zschema/compounds.py
@@ -10,9 +10,11 @@ def _is_valid_object(name, object_):
 
 class ListOf(Keyable):
 
-    def __init__(self, object_, max_items=10):
+    def __init__(self, object_, max_items=10, doc=None, category=None):
         self.object_ = object_
         self.max_items = max_items
+        self.category = category
+        self.doc = doc
         _is_valid_object("Anonymous ListOf", object_)
 
     @property
@@ -33,9 +35,27 @@ def to_bigquery(self, name):
         retv["mode"] = "REPEATED"
         return retv
 
+    def docs_bq(self, parent_category=None):
+        retv = self.object_.docs_bq()
+        category = self.category or parent_category
+        retv["category"] = category
+        retv["repeated"] = True
+        if self.doc:
+            retv["doc"] = self.doc
+        return retv
+
     def to_es(self):
         return self.object_.to_es()
 
+    def docs_es(self, parent_category=None):
+        retv = self.object_.docs_es()
+        category = self.category or parent_category
+        retv["category"] = category
+        retv["repeated"] = True
+        if self.doc:
+            retv["doc"] = self.doc
+        return retv
+
     def validate(self, name, value):
         if type(value) != list:
             raise DataValidationException("%s: %s is not a list",
@@ -59,11 +79,13 @@ def __init__(self,
             doc=None,
             extends=None,
             allow_unknown=False,
-            exclude=None):
+            exclude=None,
+            category=None):
         self.definition = definition
         self.required = required
         self.allow_unknown = allow_unknown
         self.doc = doc
+        self.category = category
         self._exclude = set(exclude) if exclude else set([])
         # merge
         if extends:
@@ -113,14 +135,25 @@ def merge(self, other):
         return self
 
     def to_bigquery(self, name):
-        fields = [v.to_bigquery(k) for (k,v) in sorted(self.definition.iteritems()) if \
-                not v.exclude_bigquery]
-        return {
+        fields = [v.to_bigquery(k) \
+                for (k,v) in sorted(self.definition.iteritems()) \
+                if not v.exclude_bigquery
+        ]
+        retv = {
             "name":self.key_to_bq(name),
             "type":"RECORD",
             "fields":fields,
             "mode":"REQUIRED" if self.required else "NULLABLE"
         }
+        return retv
+
+    def docs_bq(self, parent_category=None):
+        retv = self._docs_common(parent_category=parent_category)
+        fields = { self.key_to_bq(k): v.docs_bq() \
+                for (k,v) in sorted(self.definition.iteritems()) \
+                if not v.exclude_bigquery }
+        retv["fields"] = fields
+        return retv
 
     def print_indent_string(self, name, indent):
         tabs = "\t" * indent if indent else ""
@@ -129,10 +162,28 @@
             value.print_indent_string(name, indent+1)
 
     def to_es(self):
-        p = {self.key_to_es(k): v.to_es() for k, v in sorted(self.definition.iteritems()) \
+        p = {self.key_to_es(k): v.to_es() \
+            for k, v in sorted(self.definition.iteritems()) \
             if not v.exclude_elasticsearch}
         return {"properties": p}
 
+    def _docs_common(self, parent_category):
+        category = self.category or parent_category
+        retv = {
+            "category": category,
+            "doc": self.doc,
+            "type": self.__class__.__name__,
+            "required": self.required,
+        }
+        return retv
+
+    def docs_es(self, parent_category=None):
+        retv = self._docs_common(parent_category=parent_category)
+        retv["fields"] = { self.key_to_es(k): v.docs_es() \
+            for k, v in sorted(self.definition.iteritems()) \
+            if not v.exclude_elasticsearch }
+        return retv
+
     def to_dict(self):
         source = sorted(self.definition.iteritems())
         p = {self.key_to_es(k): v.to_dict() for k, v in source}
@@ -152,8 +203,8 @@ def validate(self, name, value):
 
 class NestedListOf(ListOf):
-    def __init__(self, object_, subrecord_name, max_items=10):
-        ListOf.__init__(self, object_, max_items)
+    def __init__(self, object_, subrecord_name, max_items=10, doc=None, category=None):
+        ListOf.__init__(self, object_, max_items, doc=doc, category=category)
         self.subrecord_name = subrecord_name
 
     def to_bigquery(self, name):
@@ -162,6 +213,19 @@ def to_bigquery(self, name):
         })
         retv = subr.to_bigquery(self.key_to_bq(name))
         retv["mode"] = "REPEATED"
+        if self.doc:
+            retv["doc"] = self.doc
+        return retv
+
+    def docs_bq(self, parent_category=None):
+        subr = SubRecord({
+            self.subrecord_name: ListOf(self.object_)
+        })
+        category = self.category or parent_category
+        retv = subr.docs_bq(parent_category=category)
+        retv["repeated"] = True
+        if self.doc:
+            retv["doc"] = self.doc
         return retv
 
 
@@ -170,24 +234,25 @@ class Record(SubRecord):
     def to_es(self, name):
         return {name:SubRecord.to_es(self)}
 
+    def docs_es(self, name, parent_category=None):
+        category = self.category or parent_category
+        return {name: SubRecord.docs_es(self, parent_category=category)}
+
     def to_bigquery(self):
         source = sorted(self.definition.iteritems())
-        return [s.to_bigquery(name) for (name, s) in source \
-                if not s.exclude_bigquery]
-
-    def to_html(self):
-        pass
+        return [s.to_bigquery(name) \
+                for (name, s) in source \
+                if not s.exclude_bigquery
+        ]
 
-    def to_documented_html(self):
-        pass
+    def docs_bq(self, name, parent_category=None):
+        category = self.category or parent_category
+        return {name: SubRecord.docs_bq(self, parent_category=category)}
 
     def print_indent_string(self):
         for name, field in sorted(self.definition.iteritems()):
             field.print_indent_string(name, 0)
 
-    def to_dotted_text(self):
-        pass
-
     def validate(self, value):
         if type(value) != dict:
             raise DataValidationException("record is not a dict", str(value))
@@ -212,5 +277,3 @@ def to_flat(self):
     @classmethod
     def from_json(cls, j):
         return cls({(k, __encode(v)) for k, v in sorted(j.iteritems())})
-
-
diff --git a/zschema/leaves.py b/zschema/leaves.py
index f5829ce..b079630 100644
--- a/zschema/leaves.py
+++ b/zschema/leaves.py
@@ -15,12 +15,11 @@ def __init__(self,
             es_index=None,
             es_analyzer=None,
             doc=None,
+            examples=None,
             es_include_raw=None,
             deprecated=False,
             ignore=False,
-            autocomplete_include=True,
-            autocomplete_category=None,
-            autocomplete_icon=None,
+            category=None,
             exclude=None,
             metadata=None,
             units=None,
@@ -30,6 +29,7 @@
         self.es_index = es_index
         self.es_analyzer = es_analyzer
         self.doc = doc
+        self.examples = examples if examples else []
         if es_include_raw is not None:
             self.es_include_raw = es_include_raw
         else:
@@ -40,9 +40,7 @@
             e = "WARN: %s is deprecated and will be removed in a "\
                 "future release\n" % self.__class__.__name__
             sys.stderr.write(e)
-        self.autocomplete_category = autocomplete_category
-        self.autocomplete_category = autocomplete_category
-        self.autocomplete_icon = autocomplete_icon
+        self.category = category
         self._exclude = set(exclude) if exclude else set([])
         self.metadata = metadata if metadata else {}
         self.units = units
@@ -56,7 +54,8 @@ def to_dict(self):
             "type":self.__class__.__name__,
             "es_type":self.ES_TYPE,
             "bq_type":self.BQ_TYPE,
-            "metadata":self.metadata
+            "metadata":self.metadata,
+            "examples": self.examples,
         }
         if self.units is not None:
             retv["units"] = self.units
@@ -72,13 +71,36 @@ def to_es(self):
         self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER")
         self.add_es_var(retv, "search_analyzer", "es_search_analyzer",
                 "ES_SEARCH_ANALYZER")
-
         if self.es_include_raw:
             retv["fields"] = {
                 "raw":{"type":"keyword"}
             }
         return retv
 
+    def _docs_common(self, parent_category):
+        retv = {
+            "detail_type": self.__class__.__name__,
+            "category": self.category or parent_category,
+            "doc": self.doc,
+            "required": self.required,
+        }
+        if hasattr(self, "values_s") and len(self.values_s):
+            retv["values"] = list(self.values_s)
+        else:
+            retv["examples"] = self.examples
+        return retv
+
+    def docs_es(self, parent_category=None):
+        retv = self._docs_common(parent_category)
+        self.add_es_var(retv, "analyzer", "es_analyzer", "ES_ANALYZER")
+        retv["type"] = self.ES_TYPE
+        return retv
+
+    def docs_bq(self, parent_category=None):
+        retv = self._docs_common(parent_category)
+        retv["type"] = self.BQ_TYPE
+        return retv
+
     def to_bigquery(self, name):
         if not self._check_valid_name(name):
             raise Exception("Invalid field name: %s" % name)
@@ -118,9 +140,6 @@
             "mode":mode
         }
 
-    def to_autocomplete(self, parent, name, repated=False):
-        pass
-
     def print_indent_string(self, name, indent):
         val = self.key_to_string(name)
         if indent:
diff --git a/zschema/tests.py b/zschema/tests.py
index 85855d6..b222f7b 100644
--- a/zschema/tests.py
+++ b/zschema/tests.py
@@ -67,6 +67,178 @@ def test_invalid(self):
     }
 }
 
+VALID_DOCS_OUTPUT_FOR_ES_FIELDS = {
+    "host": {
+        "category": None,
+        "doc": None,
+        "fields": {
+            "443": {
+                "category": "heartbleed",
+                "doc": None,
+                "fields": {
+                    "heartbleed": {
+                        "category": None,
+                        "doc": None,
+                        "fields": {
+                            "heartbeat_support": {
+                                "category": None,
+                                "detail_type": "Boolean",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "boolean"
+                            },
+                            "heartbleed_vulnerable": {
+                                "category": "Vulnerabilities",
+                                "detail_type": "Boolean",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "boolean"
+                            },
+                            "timestamp": {
+                                "category": None,
+                                "detail_type": "DateTime",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "date"
+                            }
+                        },
+                        "required": False,
+                        "type": "SubRecord"
+                    },
+                    "tls": {
+                        "category": None,
+                        "detail_type": "String",
+                        "doc": None,
+                        "examples": [],
+                        "required": False,
+                        "type": "keyword"
+                    }
+                },
+                "required": False,
+                "type": "SubRecord"
+            },
+            "ip": {
+                "category": None,
+                "detail_type": "Long",
+                "doc": "The IP Address of the host",
+                "examples": [],
+                "required": False,
+                "type": "long"
+            },
+            "ipstr": {
+                "category": None,
+                "detail_type": "IPv4Address",
+                "doc": None,
+                "examples": [
+                    "8.8.8.8"
+                ],
+                "required": True,
+                "type": "ip"
+            },
+            "tags": {
+                "category": None,
+                "detail_type": "String",
+                "doc": None,
+                "examples": [],
+                "repeated": True,
+                "required": False,
+                "type": "keyword"
+            }
+        },
+        "required": False,
+        "type": "Record"
+    }
+}
+
+VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS = {
+    "host": {
+        "category": None,
+        "doc": None,
+        "fields": {
+            "ip": {
+                "category": None,
+                "detail_type": "Long",
+                "doc": "The IP Address of the host",
+                "examples": [],
+                "required": False,
+                "type": "INTEGER"
+            },
+            "ipstr": {
+                "category": None,
+                "detail_type": "IPv4Address",
+                "doc": None,
+                "examples": [
+                    "8.8.8.8"
+                ],
+                "required": True,
+                "type": "STRING"
+            },
+            "p443": {
+                "category": "heartbleed",
+                "doc": None,
+                "fields": {
+                    "heartbleed": {
+                        "category": None,
+                        "doc": None,
+                        "fields": {
+                            "heartbeat_support": {
+                                "category": None,
+                                "detail_type": "Boolean",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "BOOLEAN"
+                            },
+                            "heartbleed_vulnerable": {
+                                "category": "Vulnerabilities",
+                                "detail_type": "Boolean",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "BOOLEAN"
+                            },
+                            "timestamp": {
+                                "category": None,
+                                "detail_type": "DateTime",
+                                "doc": None,
+                                "examples": [],
+                                "required": False,
+                                "type": "DATETIME"
+                            }
+                        },
+                        "required": False,
+                        "type": "SubRecord"
+                    },
+                    "tls": {
+                        "category": None,
+                        "detail_type": "String",
+                        "doc": None,
+                        "examples": [],
+                        "required": False,
+                        "type": "STRING"
+                    }
+                },
+                "required": False,
+                "type": "SubRecord"
+            },
+            "tags": {
+                "category": None,
+                "detail_type": "String",
+                "doc": None,
+                "examples": [],
+                "repeated": True,
+                "required": False,
+                "type": "STRING"
+            }
+        },
+        "required": False,
+        "type": "Record"
+    }
+}
+
 VALID_BIG_QUERY = [
     {
         "fields": [
@@ -115,6 +287,7 @@
             {
                 "type": "INTEGER",
                 "name": "ip",
+                "doc": "The IP Address of the host",
                 "mode": "NULLABLE"
             },
         ]
@@ -154,16 +327,16 @@ def setUp(self):
         heartbleed = SubRecord({
             "heartbeat_support":Boolean(),
-            "heartbleed_vulnerable":Boolean(),
+            "heartbleed_vulnerable":Boolean(category="Vulnerabilities"),
             "timestamp":DateTime()
         })
         self.host = Record({
-            "ipstr":IPv4Address(required=True),
-            "ip":Long(),
+            "ipstr":IPv4Address(required=True, examples=["8.8.8.8"]),
+            "ip":Long(doc="The IP Address of the host"),
             Port(443):SubRecord({
                 "tls":String(),
                 "heartbleed":heartbleed
-            }),
+            }, category="heartbleed"),
             "tags":ListOf(String())
         })
 
@@ -177,6 +350,15 @@ def test_elasticsearch(self):
         r = self.host.to_es("host")
         self.assertEqual(r, VALID_ELASTIC_SEARCH)
 
+    def test_docs_output(self):
+        global VALID_DOCS_OUTPUT_FOR_ES_FIELDS
+        r = self.host.docs_es("host")
+        self.assertEqual(r, VALID_DOCS_OUTPUT_FOR_ES_FIELDS)
+
+        global VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS
+        r = self.host.docs_bq("host")
+        self.assertEqual(r, VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS)
+
    def test_validation_known_good(self):
        test = {
            "ipstr":"141.212.120.1",
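
Usage sketch (not part of the patch; assumes the zschema package is importable and a hypothetical schema.py defines a Record named host like the one built in setUp() above):

    $ python -m zschema docs-es schema.py:host
    $ python -m zschema docs-bq schema.py:host

Each command prints json.dumps() of the nested structure illustrated by VALID_DOCS_OUTPUT_FOR_ES_FIELDS / VALID_DOCS_OUTPUT_FOR_BIG_QUERY_FIELDS: for every field, its category, doc, examples, required flag, and the backend-specific type. The same data is available programmatically via host.docs_es("host") and host.docs_bq("host").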