Skip to content

Commit

Permalink
add yaml include tag
Browse files Browse the repository at this point in the history
  • Loading branch information
kshefchek committed Oct 11, 2021
1 parent 750c247 commit 06848eb
Show file tree
Hide file tree
Showing 16 changed files with 117 additions and 29 deletions.
5 changes: 0 additions & 5 deletions docs/ingest_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,6 @@ header: 0
# Boolean to skip blank lines, default is true
skip_blank_lines: True

# Set pre-defined source_file properties (like column lists) for common file formats.
# Options: 'gpi' and 'oban'
# Additional standard formats can be added in source_config.py.
standard_format: 'gpi'

# include a map file
depends_on:
- './examples/maps/alliance-gene.yaml'
Expand Down
10 changes: 10 additions & 0 deletions examples/standards/gpi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
- "DB"
- "DB_Object_ID"
- "DB_Object_Symbol"
- "DB_Object_Name"
- "DB_Object_Synonym(s)"
- "DB_Object_Type"
- "Taxon"
- "Parent_Object_ID"
- "DB_Xref(s)"
- "Properties"
13 changes: 13 additions & 0 deletions examples/standards/oban.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
- "SUBJECT"
- "SUBJECT_LABEL"
- "SUBJECT_TAXON"
- "SUBJECT_TAXON_LABEL"
- "OBJECT"
- "OBJECT_LABEL"
- "RELATION"
- "RELATION_LABEL"
- "EVIDENCE"
- "EVIDENCE_LABEL"
- "SOURCE"
- "IS_DEFINED_BY"
- "QUALIFIER"
10 changes: 10 additions & 0 deletions examples/standards/string.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'
2 changes: 1 addition & 1 deletion examples/string-w-custom-map/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ files:
- './examples/data/string.tsv'
- './examples/data/string2.tsv'

metadata: './examples/string-w-custom-map/metadata.yaml'
metadata: !include './examples/string-w-custom-map/metadata.yaml'

columns:
- 'protein1'
Expand Down
2 changes: 1 addition & 1 deletion examples/string-w-map/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ files:
- './examples/data/string.tsv'
- './examples/data/string2.tsv'

metadata: './examples/string-w-map/metadata.yaml'
metadata: !include './examples/string-w-map/metadata.yaml'

columns:
- 'protein1'
Expand Down
16 changes: 3 additions & 13 deletions examples/string/protein-links-detailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,9 @@ files:
- './examples/data/string.tsv'
- './examples/data/string2.tsv'

metadata: './examples/string/metadata.yaml'

columns:
- 'protein1'
- 'protein2'
- 'neighborhood'
- 'fusion'
- 'cooccurence'
- 'coexpression'
- 'experimental'
- 'database'
- 'textmining'
- 'combined_score' : 'int'
metadata: !include './examples/string/metadata.yaml'

columns: !include './examples/standards/string.yaml'

filters:
- inclusion: 'include'
Expand Down
3 changes: 2 additions & 1 deletion examples/xenbase/gene-information.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ name: 'gene-information'
files:
- './examples/data/xenbase.gpi.gz'

standard_format: 'gpi'
# standard_format: 'gpi'
columns: !include './examples/standards/gpi.yaml'

compression: 'gzip'

Expand Down
2 changes: 1 addition & 1 deletion examples/xenbase/gene-literature.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: 'gene-literature'

delimiter: '\t'

metadata: './examples/xenbase/metadata.yaml'
metadata: !include './examples/xenbase/metadata.yaml'

files:
- './examples/data/LiteratureMatchedGenesByPaper.txt'
Expand Down
4 changes: 2 additions & 2 deletions examples/xenbase/gene-to-phenotype.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name: 'gene-to-phenotype'
files:
- './examples/data/xb_xpo_spo_v20210511b.csv'

metadata: './examples/xenbase/metadata.yaml'
metadata: !include './examples/xenbase/metadata.yaml'

standard_format: 'oban'
columns: !include './examples/standards/oban.yaml'

node_properties:
- 'id'
Expand Down
19 changes: 18 additions & 1 deletion koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from koza.io.writer.jsonl_writer import JSONLWriter
from koza.io.writer.tsv_writer import TSVWriter
from koza.io.writer.writer import KozaWriter
from koza.io.yaml_loader import UniqueIncludeLoader
from koza.model.config.source_config import MapFileConfig, OutputFormat
from koza.model.curie_cleaner import CurieCleaner
from koza.model.map_dict import MapDict
Expand Down Expand Up @@ -49,7 +50,7 @@ def __init__(
if source.config.depends_on is not None:
for map_file in source.config.depends_on:
with open(map_file, 'r') as map_file_fh:
map_file_config = MapFileConfig(**yaml.safe_load(map_file_fh))
map_file_config = MapFileConfig(**yaml.load(map_file_fh, Loader=UniqueIncludeLoader))
map_file_config.transform_code = (
str(Path(map_file).parent / Path(map_file).stem) + '.py'
)
Expand Down Expand Up @@ -180,3 +181,19 @@ def _load_map(self, map_file_config: MapFileConfig):
map[row[key_column]] = {
key: value for key, value in row.items() if key in value_columns
}

@staticmethod
def _map_sniffer(depends_on: str):
    """
    TODO: not yet implemented -- this is a placeholder that does nothing.

    Intended as a utility function to determine whether a depends_on
    string is
      - a path to a map config file,
      - a yaml file that should be interpreted as a dictionary, or
      - a json file that should be interpreted as a dictionary

    See https://github.com/monarch-initiative/koza/issues/39

    :param depends_on: the depends_on string to inspect
    :return: currently always None (the body is a bare ``pass``)
    """
    pass

3 changes: 2 additions & 1 deletion koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from koza.io.reader.json_reader import JSONReader
from koza.io.reader.jsonl_reader import JSONLReader
from koza.io.utils import open_resource
from koza.io.yaml_loader import UniqueIncludeLoader
from koza.model.config.source_config import (
CompressionType,
FormatType,
Expand Down Expand Up @@ -132,7 +133,7 @@ def transform_source(
translation_table = get_translation_table(global_table, local_table)

with open(source, 'r') as source_fh:
source_config = PrimaryFileConfig(**yaml.safe_load(source_fh))
source_config = PrimaryFileConfig(**yaml.load(source_fh, Loader=UniqueIncludeLoader))
if not source_config.name:
source_config.name = Path(source).stem

Expand Down
2 changes: 1 addition & 1 deletion koza/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import tempfile
from io import TextIOWrapper
from os import PathLike
from pathlib import Path, PosixPath
from pathlib import Path
from typing import IO, Union

import requests
Expand Down
47 changes: 47 additions & 0 deletions koza/io/yaml_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Custom PyYAML loaders to add support for
unique key checking and including/importing other yaml files via
an '!include' tag, e.g.
x: !include some-other.yaml
y: 1
Unique key loader based on: https://stackoverflow.com/a/63215043
Include loader based on: https://matthewpburruss.com/post/yaml/
"""

import yaml
from yaml import SafeLoader
from yaml.constructor import ConstructorError
from typing import Union, IO

from koza.io.utils import open_resource


class UniqueIncludeLoader(SafeLoader):
    """
    YAML Loader with additional support for
      - checking for duplicate keys
      - an '!include' tag for importing other yaml files

    Note: '!include' opens whatever resource the yaml document names,
    so only use this loader on configuration files you trust.
    """

    def construct_mapping(self, node: yaml.Node, deep=False):
        """
        Override of SafeLoader.construct_mapping so that every mapping
        built by this loader goes through the duplicate key check.

        :param node: the yaml node being constructed
        :param deep: passed through to the parent implementation
        :return: the constructed mapping
        :raises ConstructorError: if the mapping repeats a key
        """
        return self.unique_construct_mapping(node, deep=deep)

    def unique_construct_mapping(self, node: yaml.Node, deep=False):
        """
        Construct a mapping, raising an error if any key appears twice

        :param node: the yaml node being constructed
        :param deep: passed through to construct_object/construct_mapping
        :return: the mapping built by SafeLoader.construct_mapping
        :raises ConstructorError: if the mapping repeats a key
        """
        # Anything that is not a mapping node is left to the parent
        # implementation, which raises the appropriate ConstructorError
        if isinstance(node, yaml.MappingNode):
            seen = set()
            for key_node, _value_node in node.value:
                # Merge keys ('<<') are expanded by the parent implementation
                # and have no constructor of their own, so skip them here
                if key_node.tag == 'tag:yaml.org,2002:merge':
                    continue
                key = self.construct_object(key_node, deep=deep)
                try:
                    is_duplicate = key in seen
                except TypeError:
                    # Unhashable key: the parent implementation reports
                    # this with its own ConstructorError
                    continue
                if is_duplicate:
                    raise ConstructorError(
                        "while constructing a mapping",
                        node.start_mark,
                        f"found duplicate key {key!r}",
                        key_node.start_mark,
                    )
                seen.add(key)
        return super().construct_mapping(node, deep)

    def include_constructor(self, node: yaml.Node) -> object:
        """
        Opens some resource (local or remote file) that appears after an
        !include tag and returns its parsed yaml content

        :param node: scalar node holding the location of the yaml to include
        :return: whatever python object the included yaml document parses to
        """
        # NOTE(review): assumes open_resource returns a file-like object
        # with a close() method -- confirm against koza.io.utils
        resource = open_resource(self.construct_scalar(node))
        try:
            return yaml.load(resource, Loader=UniqueIncludeLoader)
        finally:
            # yaml.load does not close the stream it is given
            resource.close()


yaml.add_constructor('!include', UniqueIncludeLoader.include_constructor, UniqueIncludeLoader)
5 changes: 4 additions & 1 deletion koza/model/config/source_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class SourceConfig:
filters: List[ColumnFilter] = field(default_factory=list)
json_path: List[Union[StrictStr, StrictInt]] = None
transform_code: str = None
transform_mode: TransformMode = TransformMode.loop
transform_mode: TransformMode = TransformMode.flat

def __post_init_post_parse__(self):
"""
Expand All @@ -202,6 +202,8 @@ def __post_init_post_parse__(self):

if self.metadata and isinstance(self.metadata, str):
# If this looks like a file path attempt to load it from the yaml
# TODO enforce that this is imported via an include?
# See https://github.com/monarch-initiative/koza/issues/46
try:
object.__setattr__(
self, 'metadata', DatasetDescription(**yaml.safe_load(self.metadata))
Expand All @@ -211,6 +213,7 @@ def __post_init_post_parse__(self):
LOG.warning("Could not load dataset description from metadata file")

# todo: where should this really be stored? defaults for a format should probably be defined in yaml
# We will replace this with https://github.com/monarch-initiative/koza/issues/46
if self.standard_format == StandardFormat.gpi:
self.format = FormatType.csv
self.delimiter = "\t"
Expand Down
3 changes: 2 additions & 1 deletion koza/model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from koza.io.reader.csv_reader import CSVReader
from koza.io.reader.json_reader import JSONReader
from koza.io.reader.jsonl_reader import JSONLReader
from koza.io.yaml_loader import UniqueIncludeLoader
from koza.io.utils import open_resource
from koza.model.config.source_config import MapFileConfig, PrimaryFileConfig, SourceConfig
from koza.row_filter import RowFilter
Expand Down Expand Up @@ -36,7 +37,7 @@ def __init__(
if not isinstance(config, SourceConfig):
# Check to see if it's a file path
with open(config, 'r') as source_file_fh:
self.config = PrimaryFileConfig(**yaml.safe_load(source_file_fh))
self.config = PrimaryFileConfig(**yaml.load(source_file_fh, Loader=UniqueIncludeLoader))
else:
# TODO better error handling
self.config = config
Expand Down

0 comments on commit 06848eb

Please sign in to comment.