add yaml include tag

monarch-initiative · Oct 11, 2021 · 06848eb · 06848eb
1 parent 750c247
commit 06848eb
Show file tree

Hide file tree

Showing 16 changed files with 117 additions and 29 deletions.
diff --git a/docs/ingest_configuration.md b/docs/ingest_configuration.md
@@ -54,11 +54,6 @@ header: 0
 # Boolean to skip blank lines, default is true
 skip_blank_lines: True
 
-# Set pre-defined source_file properties (like column lists) for common file formats. 
-# Options: 'gpi' and 'oban'
-# Additional standard formats can be added in source_config.py. 
-standard_format: 'gpi'
-
 # include a map file
 depends_on:
   - './examples/maps/alliance-gene.yaml'

diff --git a/examples/standards/gpi.yaml b/examples/standards/gpi.yaml
@@ -0,0 +1,10 @@
+- "DB"
+- "DB_Object_ID"
+- "DB_Object_Symbol"
+- "DB_Object_Name"
+- "DB_Object_Synonym(s)"
+- "DB_Object_Type"
+- "Taxon"
+- "Parent_Object_ID"
+- "DB_Xref(s)"
+- "Properties"
diff --git a/examples/standards/oban.yaml b/examples/standards/oban.yaml
@@ -0,0 +1,13 @@
+- "SUBJECT"
+- "SUBJECT_LABEL"
+- "SUBJECT_TAXON"
+- "SUBJECT_TAXON_LABEL"
+- "OBJECT"
+- "OBJECT_LABEL"
+- "RELATION"
+- "RELATION_LABEL"
+- "EVIDENCE"
+- "EVIDENCE_LABEL"
+- "SOURCE"
+- "IS_DEFINED_BY"
+- "QUALIFIER"
diff --git a/examples/standards/string.yaml b/examples/standards/string.yaml
@@ -0,0 +1,10 @@
+- 'protein1'
+- 'protein2'
+- 'neighborhood'
+- 'fusion'
+- 'cooccurence'
+- 'coexpression'
+- 'experimental'
+- 'database'
+- 'textmining'
+- 'combined_score' : 'int'
diff --git a/examples/string-w-custom-map/protein-links-detailed.yaml b/examples/string-w-custom-map/protein-links-detailed.yaml
@@ -6,7 +6,7 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string-w-custom-map/metadata.yaml'
+metadata: !include './examples/string-w-custom-map/metadata.yaml'
 
 columns:
   - 'protein1'

diff --git a/examples/string-w-map/protein-links-detailed.yaml b/examples/string-w-map/protein-links-detailed.yaml
@@ -6,7 +6,7 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string-w-map/metadata.yaml'
+metadata: !include './examples/string-w-map/metadata.yaml'
 
 columns:
   - 'protein1'

diff --git a/examples/string/protein-links-detailed.yaml b/examples/string/protein-links-detailed.yaml
@@ -6,19 +6,9 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string/metadata.yaml'
-
-columns:
-  - 'protein1'
-  - 'protein2'
-  - 'neighborhood'
-  - 'fusion'
-  - 'cooccurence'
-  - 'coexpression'
-  - 'experimental'
-  - 'database'
-  - 'textmining'
-  - 'combined_score' : 'int'
+metadata: !include './examples/string/metadata.yaml'
+
+columns: !include './examples/standards/string.yaml'
 
 filters:
   - inclusion: 'include'

diff --git a/examples/xenbase/gene-information.yaml b/examples/xenbase/gene-information.yaml
@@ -3,7 +3,8 @@ name: 'gene-information'
 files:
   - './examples/data/xenbase.gpi.gz'
 
-standard_format: 'gpi'
+# standard_format: 'gpi'
+columns: !include './examples/standards/gpi.yaml'
 
 compression: 'gzip'
 

diff --git a/examples/xenbase/gene-literature.yaml b/examples/xenbase/gene-literature.yaml
@@ -2,7 +2,7 @@ name: 'gene-literature'
 
 delimiter: '\t'
 
-metadata: './examples/xenbase/metadata.yaml'
+metadata: !include './examples/xenbase/metadata.yaml'
 
 files:
   - './examples/data/LiteratureMatchedGenesByPaper.txt'

diff --git a/examples/xenbase/gene-to-phenotype.yaml b/examples/xenbase/gene-to-phenotype.yaml
@@ -3,9 +3,9 @@ name: 'gene-to-phenotype'
 files:
   - './examples/data/xb_xpo_spo_v20210511b.csv'
 
-metadata: './examples/xenbase/metadata.yaml'
+metadata: !include './examples/xenbase/metadata.yaml'
 
-standard_format: 'oban'
+columns: !include './examples/standards/oban.yaml'
 
 node_properties:
   - 'id'

diff --git a/koza/app.py b/koza/app.py
@@ -11,6 +11,7 @@
 from koza.io.writer.jsonl_writer import JSONLWriter
 from koza.io.writer.tsv_writer import TSVWriter
 from koza.io.writer.writer import KozaWriter
+from koza.io.yaml_loader import UniqueIncludeLoader
 from koza.model.config.source_config import MapFileConfig, OutputFormat
 from koza.model.curie_cleaner import CurieCleaner
 from koza.model.map_dict import MapDict
@@ -49,7 +50,7 @@ def __init__(
         if source.config.depends_on is not None:
             for map_file in source.config.depends_on:
                 with open(map_file, 'r') as map_file_fh:
-                    map_file_config = MapFileConfig(**yaml.safe_load(map_file_fh))
+                    map_file_config = MapFileConfig(**yaml.load(map_file_fh, Loader=UniqueIncludeLoader))
                     map_file_config.transform_code = (
                         str(Path(map_file).parent / Path(map_file).stem) + '.py'
                     )
@@ -180,3 +181,19 @@ def _load_map(self, map_file_config: MapFileConfig):
                 map[row[key_column]] = {
                     key: value for key, value in row.items() if key in value_columns
                 }
+
+    @staticmethod
+    def _map_sniffer(depends_on: str):
+        """
+        Placeholder, not yet implemented.
+
+        TODO: a utility function that inspects a depends_on string and
+        determines which of the following it is:
+          - a path to a map config file
+          - a yaml file that should be interpreted as a dictionary
+          - a json file that should be interpreted as a dictionary
+
+        See https://github.com/monarch-initiative/koza/issues/39
+
+        :param depends_on: the depends_on string to classify
+        :return: None for now, since nothing is implemented yet
+        """
+        return None
+
diff --git a/koza/cli_runner.py b/koza/cli_runner.py
@@ -13,6 +13,7 @@
 from koza.io.reader.json_reader import JSONReader
 from koza.io.reader.jsonl_reader import JSONLReader
 from koza.io.utils import open_resource
+from koza.io.yaml_loader import UniqueIncludeLoader
 from koza.model.config.source_config import (
     CompressionType,
     FormatType,
@@ -132,7 +133,7 @@ def transform_source(
     translation_table = get_translation_table(global_table, local_table)
 
     with open(source, 'r') as source_fh:
-        source_config = PrimaryFileConfig(**yaml.safe_load(source_fh))
+        source_config = PrimaryFileConfig(**yaml.load(source_fh, Loader=UniqueIncludeLoader))
         if not source_config.name:
             source_config.name = Path(source).stem
 

diff --git a/koza/io/utils.py b/koza/io/utils.py
@@ -6,7 +6,7 @@
 import tempfile
 from io import TextIOWrapper
 from os import PathLike
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import IO, Union
 
 import requests

diff --git a/koza/io/yaml_loader.py b/koza/io/yaml_loader.py
@@ -0,0 +1,47 @@
+"""
+Custom PyYaml loaders to add support for
+unique key checking and including/importing other yaml files via
+an '!include' tag, e.g.
+
+x: !include some-other.yaml
+y: 1
+
+Unique key loader based on: https://stackoverflow.com/a/63215043
+Include loader based on: https://matthewpburruss.com/post/yaml/
+"""
+
+import yaml
+from yaml import SafeLoader
+from yaml.constructor import ConstructorError
+from typing import Union, IO
+
+from koza.io.utils import open_resource
+
+
+class UniqueIncludeLoader(SafeLoader):
+    """
+    YAML Loader with additional support for
+    - checking for duplicate keys
+    - an '!include' tag for importing other yaml files
+
+    Duplicate-key checking is hooked in by overriding construct_mapping,
+    which is the method SafeLoader calls when it builds a mapping.
+    """
+
+    def construct_mapping(self, node: yaml.Node, deep=False):
+        """
+        Build a mapping from ``node``, rejecting duplicate keys.
+
+        This override is what makes the uniqueness check run during loading;
+        unique_construct_mapping on its own is never called by PyYAML.
+
+        :param node: the mapping node being constructed
+        :param deep: passed through to the underlying constructors
+        :return: the constructed mapping
+        :raises ConstructorError: if a key occurs more than once
+        """
+        return self.unique_construct_mapping(node, deep=deep)
+
+    def unique_construct_mapping(self, node: yaml.Node, deep=False):
+        """
+        Check ``node`` for repeated keys, then delegate to SafeLoader.
+
+        :param node: the mapping node being constructed
+        :param deep: passed through to construct_object / construct_mapping
+        :return: the mapping built by SafeLoader.construct_mapping
+        :raises ConstructorError: if a key occurs more than once
+        """
+        # Non-mapping nodes are left to the parent, which reports them itself.
+        if isinstance(node, yaml.MappingNode):
+            # A list rather than a set: membership then relies on == only, so
+            # unhashable keys fall through to the parent's own error handling.
+            seen = []
+            for key_node, _ in node.value:
+                # Merge keys ('<<') are resolved by the parent implementation
+                # rather than constructed as ordinary keys, so skip them here.
+                if key_node.tag == 'tag:yaml.org,2002:merge':
+                    continue
+                key = self.construct_object(key_node, deep=deep)
+                if key in seen:
+                    # Marks let the error point at the offending line/column.
+                    raise ConstructorError(
+                        "while constructing a mapping",
+                        node.start_mark,
+                        f"found duplicate key {key}",
+                        key_node.start_mark,
+                    )
+                seen.append(key)
+        return super().construct_mapping(node, deep)
+
+    def include_constructor(self, node: yaml.Node) -> object:
+        """
+        Loads the yaml resource (local or remote file) that appears after
+        an !include tag and returns its parsed content.
+
+        The included document is parsed with UniqueIncludeLoader as well, so it
+        may contain further !include tags.
+
+        :param node: scalar node whose value is passed to open_resource
+        :return: whatever the included yaml document parses to
+        """
+        # NOTE(review): assumes open_resource returns a file-like object with
+        # close(); confirm against koza.io.utils
+        resource = open_resource(self.construct_scalar(node))
+        try:
+            return yaml.load(resource, Loader=UniqueIncludeLoader)
+        finally:
+            # yaml.load has fully consumed the stream; don't leak the handle
+            resource.close()
+
+
+# Register the '!include' tag on this loader class only
+UniqueIncludeLoader.add_constructor('!include', UniqueIncludeLoader.include_constructor)
diff --git a/koza/model/config/source_config.py b/koza/model/config/source_config.py
@@ -185,7 +185,7 @@ class SourceConfig:
     filters: List[ColumnFilter] = field(default_factory=list)
     json_path: List[Union[StrictStr, StrictInt]] = None
     transform_code: str = None
-    transform_mode: TransformMode = TransformMode.loop
+    transform_mode: TransformMode = TransformMode.flat
 
     def __post_init_post_parse__(self):
         """
@@ -202,6 +202,8 @@ def __post_init_post_parse__(self):
 
         if self.metadata and isinstance(self.metadata, str):
             # If this looks like a file path attempt to load it from the yaml
+            # TODO enforce that this is imported via an include?
+            # See https://github.com/monarch-initiative/koza/issues/46
             try:
                 object.__setattr__(
                     self, 'metadata', DatasetDescription(**yaml.safe_load(self.metadata))
@@ -211,6 +213,7 @@ def __post_init_post_parse__(self):
                 LOG.warning("Could not load dataset description from metadata file")
 
         # todo: where should this really be stored? defaults for a format should probably be defined in yaml
+        # To be replaced as part of https://github.com/monarch-initiative/koza/issues/46
         if self.standard_format == StandardFormat.gpi:
             self.format = FormatType.csv
             self.delimiter = "\t"

diff --git a/koza/model/source.py b/koza/model/source.py
@@ -5,6 +5,7 @@
 from koza.io.reader.csv_reader import CSVReader
 from koza.io.reader.json_reader import JSONReader
 from koza.io.reader.jsonl_reader import JSONLReader
+from koza.io.yaml_loader import UniqueIncludeLoader
 from koza.io.utils import open_resource
 from koza.model.config.source_config import MapFileConfig, PrimaryFileConfig, SourceConfig
 from koza.row_filter import RowFilter
@@ -36,7 +37,7 @@ def __init__(
         if not isinstance(config, SourceConfig):
             # Check to see if it's a file path
             with open(config, 'r') as source_file_fh:
-                self.config = PrimaryFileConfig(**yaml.safe_load(source_file_fh))
+                self.config = PrimaryFileConfig(**yaml.load(source_file_fh, Loader=UniqueIncludeLoader))
         else:
             # TODO better error handling
             self.config = config