Merge branch 'main' into bump-biolink-model-pydantic
kshefchek authored Oct 23, 2021
2 parents 3c9c745 + 8e6c9a0 commit 4001285
Showing 29 changed files with 329 additions and 130 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml → .github/workflows/build.yml
@@ -1,6 +1,6 @@
 # Builds and runs pytest on ubuntu-latest
 # Tests python versions >=3.6
-name: run tests
+name: build
 
 on:
   push:
17 changes: 17 additions & 0 deletions .github/workflows/documentation.yml
@@ -0,0 +1,17 @@
+name: deploy documentation
+on:
+  push:
+    branches:
+      - main
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+      - run: pip install mkdocs
+      - run: mkdocs gh-deploy --force
+        env:
+          GH_TOKEN: ${{ secrets.GH_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -18,7 +18,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          make install
+          make
       - name: Publish to PyPi
         env:
           FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }}
10 changes: 7 additions & 3 deletions README.md
@@ -1,3 +1,7 @@
+[![Pyversions](https://img.shields.io/pypi/pyversions/koza.svg)](https://pypi.python.org/pypi/koza)
+![](https://github.com/monarch-initiative/koza/actions/workflows/build.yml/badge.svg)
+[![PyPi](https://img.shields.io/pypi/v/koza.svg)](https://pypi.python.org/pypi/koza)
+
 ### Koza
 
 ![pupa](docs/img/pupa.png) Data transformation framework
@@ -30,7 +34,7 @@ Send a local or remote csv file through Koza to get some basic information (head
 
 ```bash
 koza validate \
-  --file https://raw.githubusercontent.com/monarch-initiative/koza/dev/tests/resources/source-files/string.tsv \
+  --file https://raw.githubusercontent.com/monarch-initiative/koza/main/examples/data/string.tsv \
   --delimiter ' '
 ```
 
@@ -51,7 +55,7 @@ koza validate \
 ###### Example: transforming StringDB
 
 ```bash
-koza transform --source examples/string/metadata.yaml
+koza transform --source examples/string/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 
-koza transform --source examples/string-declarative/metadata.yaml
+koza transform --source examples/string-declarative/protein-links-detailed.yaml --global-table examples/translation_table.yaml
 ```
1 change: 1 addition & 0 deletions docs/CNAME
@@ -0,0 +1 @@
+koza.monarchinitiative.org
10 changes: 10 additions & 0 deletions docs/index.md
@@ -60,3 +60,13 @@ koza transform --source examples/string/metadata.yaml
 
 koza transform --source examples/string-declarative/metadata.yaml
 ```
+#### Running an ingest from within a python script
+
+Executing a koza transform from within a python script can be done by calling `transform_source` from `koza.cli_runner`:
+
+```python
+from koza.cli_runner import transform_source
+
+transform_source("./examples/string/protein-links-detailed.yaml",
+                 "output", "tsv", "./examples/translation_table.yaml", None)
+```
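The positional call above maps onto the `transform_source` signature shown in the `koza/cli_runner.py` diff further down; as a sketch (keyword names taken from that signature, not from this docs snippet), the same call written with keyword arguments:

```python
from koza.cli_runner import transform_source

# Keyword-argument form of the positional call shown in the docs example above
transform_source(
    source="./examples/string/protein-links-detailed.yaml",
    output_dir="output",
    output_format="tsv",
    global_table="./examples/translation_table.yaml",
    local_table=None,
)
```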
42 changes: 32 additions & 10 deletions docs/ingest_configuration.md
@@ -6,6 +6,8 @@ Ingests are configured via a single source file yaml, and optional mapping file
 
 This YAML file sets properties for the ingest of a single file type from within a Source.
 
+Tip: relative paths are relative to the directory where you execute Koza.
+
 ```yaml
 name: 'name-of-ingest'
 
@@ -41,20 +43,19 @@ delimiter: '\t'
 # Optional delimiter for header row
 header_delimiter: '|'
 
-# Boolean to configure presence of header, default is true
-has_header: 'False'
-
-# Number of lines to be ignored at the head of an ingest data file, default is 0
-skip_lines: 10
+# Optional, int | 'infer' | 'none', Default = 'infer'
+# The index (0-based) at which the header appears in the file.
+#
+# If header is set to 'infer', the header will be set to the first
+# line that is not blank or commented with a hash.
+#
+# If header is set to 'none', the columns field will be used,
+# or a ValueError is raised if columns are not supplied.
+header: 0
 
 # Boolean to skip blank lines, default is true
 skip_blank_lines: True
 
-# Set pre-defined source_file properties (like column lists) for common file formats.
-# Options: 'gpi' and 'oban'
-# Additional standard formats can be added in source_config.py.
-standard_format: 'gpi'
-
 # include a map file
 depends_on:
   - './examples/maps/alliance-gene.yaml'
@@ -110,3 +111,24 @@ values:
   - value1
   - value2
 ```
+
+### Composing Configuration from Multiple Yaml Files
+
+The Koza yaml loader supports importing/including other yaml files via an `!include` tag.
+To reuse fields that appear in multiple ingests, such as metadata and columns:
+
+```yaml
+metadata: !include './path/to/metadata.yaml'
+columns: !include './path/to/standard-columns.yaml'
+```
+
+For example, a standard column file is formatted as a yaml list, i.e. the parent key is omitted:
+
+```yaml
+- 'column_1'
+- 'column_2'
+- 'column_3'
+- 'column_4' : 'int'
+```
+
+Tip: `!include` tags must appear as values in a yaml file.
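For background, `!include` is not part of the YAML spec; it has to be registered as a custom constructor on the loader. Koza does this in `koza.io.yaml_loader.UniqueIncludeLoader` (used in the `koza/app.py` and `koza/cli_runner.py` changes further down); the snippet below is only a minimal PyYAML sketch of the idea, not Koza's implementation:

```python
import yaml


class IncludeLoader(yaml.SafeLoader):
    """Illustrative loader supporting an '!include' tag; not Koza's UniqueIncludeLoader."""


def _include(loader: IncludeLoader, node: yaml.Node):
    # Paths are opened as given, i.e. relative to the working directory,
    # matching the "relative paths" tip above.
    path = loader.construct_scalar(node)
    with open(path, 'r') as fh:
        # Parse the included file with the same loader so nested includes also work
        return yaml.load(fh, Loader=IncludeLoader)


IncludeLoader.add_constructor('!include', _include)

# Example:
# with open('./examples/string/protein-links-detailed.yaml') as fh:
#     config = yaml.load(fh, Loader=IncludeLoader)
```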
1 change: 1 addition & 0 deletions examples/maps/entrez-2-string.yaml
@@ -5,6 +5,7 @@ metadata:
 
 delimiter: '\t'
 header_delimiter: '/'
+header: 0
 
 # Assumes that no identifiers are overlapping
 # otherwise these should go into separate configs
2 changes: 0 additions & 2 deletions examples/maps/genepage-2-gene.yaml
@@ -4,8 +4,6 @@ metadata:
 
 delimiter: '\t'
 
-has_header: 'False'
-
 files:
   - './examples/data/XenbaseGenepageToGeneIdMapping.txt'
 
10 changes: 10 additions & 0 deletions examples/standards/gpi.yaml
@@ -0,0 +1,10 @@
+- "DB"
+- "DB_Object_ID"
+- "DB_Object_Symbol"
+- "DB_Object_Name"
+- "DB_Object_Synonym(s)"
+- "DB_Object_Type"
+- "Taxon"
+- "Parent_Object_ID"
+- "DB_Xref(s)"
+- "Properties"
13 changes: 13 additions & 0 deletions examples/standards/oban.yaml
@@ -0,0 +1,13 @@
+- "SUBJECT"
+- "SUBJECT_LABEL"
+- "SUBJECT_TAXON"
+- "SUBJECT_TAXON_LABEL"
+- "OBJECT"
+- "OBJECT_LABEL"
+- "RELATION"
+- "RELATION_LABEL"
+- "EVIDENCE"
+- "EVIDENCE_LABEL"
+- "SOURCE"
+- "IS_DEFINED_BY"
+- "QUALIFIER"
10 changes: 10 additions & 0 deletions examples/standards/string.yaml
@@ -0,0 +1,10 @@
+- 'protein1'
+- 'protein2'
+- 'neighborhood'
+- 'fusion'
+- 'cooccurence'
+- 'coexpression'
+- 'experimental'
+- 'database'
+- 'textmining'
+- 'combined_score' : 'int'
2 changes: 1 addition & 1 deletion examples/string-w-custom-map/protein-links-detailed.yaml
@@ -6,7 +6,7 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string-w-custom-map/metadata.yaml'
+metadata: !include './examples/string-w-custom-map/metadata.yaml'
 
 columns:
   - 'protein1'
2 changes: 1 addition & 1 deletion examples/string-w-map/protein-links-detailed.yaml
@@ -6,7 +6,7 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string-w-map/metadata.yaml'
+metadata: !include './examples/string-w-map/metadata.yaml'
 
 columns:
   - 'protein1'
16 changes: 3 additions & 13 deletions examples/string/protein-links-detailed.yaml
@@ -6,19 +6,9 @@ files:
   - './examples/data/string.tsv'
   - './examples/data/string2.tsv'
 
-metadata: './examples/string/metadata.yaml'
-
-columns:
-  - 'protein1'
-  - 'protein2'
-  - 'neighborhood'
-  - 'fusion'
-  - 'cooccurence'
-  - 'coexpression'
-  - 'experimental'
-  - 'database'
-  - 'textmining'
-  - 'combined_score' : 'int'
+metadata: !include './examples/string/metadata.yaml'
+
+columns: !include './examples/standards/string.yaml'
 
 filters:
   - inclusion: 'include'
5 changes: 3 additions & 2 deletions examples/xenbase/gene-information.yaml
@@ -3,11 +3,12 @@ name: 'gene-information'
 files:
   - './examples/data/xenbase.gpi.gz'
 
-standard_format: 'gpi'
+# standard_format: 'gpi'
+columns: !include './examples/standards/gpi.yaml'
 
 compression: 'gzip'
 
-skip_lines: 22
+header: 23
 
 metadata: './examples/xenbase/metadata.yaml'
 
4 changes: 1 addition & 3 deletions examples/xenbase/gene-literature.yaml
@@ -2,9 +2,7 @@ name: 'gene-literature'
 
 delimiter: '\t'
 
-has_header: False
-
-metadata: './examples/xenbase/metadata.yaml'
+metadata: !include './examples/xenbase/metadata.yaml'
 
 files:
   - './examples/data/LiteratureMatchedGenesByPaper.txt'
4 changes: 2 additions & 2 deletions examples/xenbase/gene-to-phenotype.yaml
@@ -3,9 +3,9 @@ name: 'gene-to-phenotype'
 files:
   - './examples/data/xb_xpo_spo_v20210511b.csv'
 
-metadata: './examples/xenbase/metadata.yaml'
+metadata: !include './examples/xenbase/metadata.yaml'
 
-standard_format: 'oban'
+columns: !include './examples/standards/oban.yaml'
 
 node_properties:
   - 'id'
4 changes: 2 additions & 2 deletions koza/__init__.py
@@ -1,2 +1,2 @@
-"""Koza, an ETL framework for the Biolink model"""
-__version__ = '0.1.1'
+"""Koza, an ETL framework for LinkML data models"""
+__version__ = '0.1.3'
19 changes: 18 additions & 1 deletion koza/app.py
@@ -11,6 +11,7 @@
 from koza.io.writer.jsonl_writer import JSONLWriter
 from koza.io.writer.tsv_writer import TSVWriter
 from koza.io.writer.writer import KozaWriter
+from koza.io.yaml_loader import UniqueIncludeLoader
 from koza.model.config.source_config import MapFileConfig, OutputFormat
 from koza.model.curie_cleaner import CurieCleaner
 from koza.model.map_dict import MapDict
@@ -49,7 +50,9 @@ def __init__(
         if source.config.depends_on is not None:
             for map_file in source.config.depends_on:
                 with open(map_file, 'r') as map_file_fh:
-                    map_file_config = MapFileConfig(**yaml.safe_load(map_file_fh))
+                    map_file_config = MapFileConfig(
+                        **yaml.load(map_file_fh, Loader=UniqueIncludeLoader)
+                    )
                     map_file_config.transform_code = (
                         str(Path(map_file).parent / Path(map_file).stem) + '.py'
                     )
@@ -180,3 +183,17 @@ def _load_map(self, map_file_config: MapFileConfig):
                 map[row[key_column]] = {
                     key: value for key, value in row.items() if key in value_columns
                 }
+
+    @staticmethod
+    def _map_sniffer(depends_on: str):
+        """
+        TODO: a utility function to determine if a depends_on string
+        is a path to a map config file, a yaml file that should be
+        interpreted as a dictionary, or a json file that should be
+        interpreted as a dictionary.
+        See https://github.com/monarch-initiative/koza/issues/39
+
+        :param depends_on:
+        :return:
+        """
7 changes: 4 additions & 3 deletions koza/cli_runner.py
@@ -13,6 +13,7 @@
 from koza.io.reader.json_reader import JSONReader
 from koza.io.reader.jsonl_reader import JSONLReader
 from koza.io.utils import open_resource
+from koza.io.yaml_loader import UniqueIncludeLoader
 from koza.model.config.source_config import (
     CompressionType,
     FormatType,
@@ -36,7 +37,7 @@ def set_koza_app(
     source: Source,
     translation_table: TranslationTable = None,
     output_dir: str = './output',
-    output_format: OutputFormat = OutputFormat('jsonl'),
+    output_format: OutputFormat = OutputFormat('tsv'),
 ) -> KozaApp:
     """
     Setter for singleton koza app object
@@ -124,15 +125,15 @@ def get_translation_table(global_table: str = None, local_table: str = None) ->
 def transform_source(
     source: str,
     output_dir: str,
-    output_format: OutputFormat,
+    output_format: OutputFormat = OutputFormat('tsv'),
     global_table: str = None,
     local_table: str = None,
 ):
 
     translation_table = get_translation_table(global_table, local_table)
 
     with open(source, 'r') as source_fh:
-        source_config = PrimaryFileConfig(**yaml.safe_load(source_fh))
+        source_config = PrimaryFileConfig(**yaml.load(source_fh, Loader=UniqueIncludeLoader))
         if not source_config.name:
             source_config.name = Path(source).stem
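With `output_format` now defaulting to `OutputFormat('tsv')` in both `set_koza_app` and `transform_source`, callers can omit it. A minimal sketch, assuming the keyword names in the signature above:

```python
from koza.cli_runner import transform_source
from koza.model.config.source_config import OutputFormat

# Explicit form, spelling out the new default
transform_source(
    source='./examples/string/protein-links-detailed.yaml',
    output_dir='output',
    output_format=OutputFormat('tsv'),
    global_table='./examples/translation_table.yaml',
)

# Equivalent call relying on the tsv default
transform_source(
    source='./examples/string/protein-links-detailed.yaml',
    output_dir='output',
    global_table='./examples/translation_table.yaml',
)
```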