Commit

clean up config docs given changes

kshefchek committed Sep 27, 2021
1 parent b76b15b commit efefe50
Showing 9 changed files with 45 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -24,4 +24,4 @@ jobs:
FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }}
FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
make publish
make publish
1 change: 1 addition & 0 deletions Makefile
@@ -39,6 +39,7 @@ clean:
rm -rf `find . -name __pycache__`
rm -f `find . -type f -name '*.py[co]' `
rm -rf .pytest_cache
rm -rf test-output
rm -rf dist

.PHONY: lint
16 changes: 11 additions & 5 deletions docs/index.md
@@ -2,9 +2,9 @@

![pupa](img/pupa.png)

*Disclaimer*: Koza is in beta; we are looking for beta testers
*Disclaimer*: Koza is in beta

Transform csv, json, yaml, jsonl, and xml and converting them to a target
Transform csv, json, jsonl, and yaml - converting them to a target
csv, json, or jsonl format based on your dataclass model. Koza can also output
data in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv)
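
For context, a hedged sketch (not part of this commit): the KGX output is presumably shaped by the node and edge properties a source file declares via `node_properties` and `edge_properties` (see `PrimaryFileConfig` further down in this commit); the property names below are illustrative assumptions, not the project's canonical list.

```yaml
# Hypothetical fragment of a source file yaml.
# Property names are illustrative; align them with your Biolink/KGX model.
node_properties:
  - 'id'
  - 'category'
  - 'provided_by'

edge_properties:
  - 'id'
  - 'subject'
  - 'predicate'
  - 'object'
  - 'provided_by'
```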

@@ -17,13 +17,19 @@ data in the [KGX format](https://github.com/biolink/kgx/blob/master/specificatio
- Create and use translation tables to map between source and target vocabularies


#### Installation
### Installation

```
pip install koza
```

#### Getting Started
### Getting Started

#### Writing an ingest

[Ingest Configuration](ingest_configuration.md)

#### Running an ingest

Send a local or remote csv file through Koza to get some basic information (headers, number of rows)

@@ -47,7 +53,7 @@ koza validate \
--compression gzip
```

###### Example: transforming StringDB
##### Example: transforming StringDB

```bash
koza transform --source examples/string/metadata.yaml
41 changes: 23 additions & 18 deletions docs/ingest_configuration.md
@@ -1,23 +1,6 @@
## Ingest Configuration

### Source (aka metadata.yaml)

The Source File provides metadata for the description of the dataset and the list of Source Files to be ingested

```yaml
name: 'somethingbase'

dataset_description:
  ingest_title: 'SomethingBase'
  ingest_url: 'https://somethingbase.org'
  description: 'SomethingBase: A Website With Some Data'
  rights: 'https://somethingbase.org/rights.html'

# The list of source files should map
source_files:
- 'gene-information.yaml'
- 'gene-to-phenotype.yaml'
```
Ingests are configured via a single source file yaml and optional mapping file yaml(s).

### Source File(s)

@@ -36,6 +19,12 @@ files:
- './data/really-cool-data-1.json.gz'
- './data/really-cool-data-2.json.gz'

# The dataset description
metadata:
  description: 'SomethingBase: A Website With Some Data'
  rights: 'https://somethingbase.org/rights.html'


# in a JSON ingest, this will be the path to the array to be iterated over as the input collection
json_path:
- 'data'
@@ -105,3 +94,19 @@ transform_mode: 'loop'
# You probably don't need to set this property
transform_code: 'name-of-ingest.py'
```
### Map File(s)
This YAML file sets properties for creating a dictionary that an ingest depends on.
It contains the same options as a source file, excluding depends_on, node_properties,
edge_properties, and on_map_failure. It adds the following options:
```yaml
# The column from which to take the key for the dictionary
key: someKey

# The column(s) whose contents are stored as values for the key
values:
- value1
- value2
```
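
For context (a hypothetical sketch, not part of this commit), a primary source file would reference a map like this through `depends_on` and choose a lookup-failure mode with `on_map_failure` — see `PrimaryFileConfig` below. The path and values shown are assumptions for illustration.

```yaml
# Hypothetical fragment of a primary source file yaml that uses a map.
# The path and the failure-mode value are assumptions, not from this commit.
depends_on:
  - './examples/maps/entrez-2-string.yaml'

# 'warning' (the default per MapErrorEnum.warning) logs rows that miss the map
# and continues; a stricter mode such as 'error' is assumed to abort the ingest
on_map_failure: 'warning'
```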
1 change: 0 additions & 1 deletion docs/requirements.txt

This file was deleted.

2 changes: 0 additions & 2 deletions examples/maps/custom-entrez-2-string.yaml
@@ -3,8 +3,6 @@ name: 'custom_entrez_2_string'
metadata:
  description: 'Mapping file provided by StringDB that contains entrez to protein ID mappings'

source: 'stringdb'

delimiter: '\t'
header_delimiter: '/'

2 changes: 0 additions & 2 deletions examples/maps/entrez-2-string.yaml
@@ -3,8 +3,6 @@ name: 'entrez_2_string'
metadata:
  description: 'Mapping file provided by StringDB that contains entrez to protein ID mappings'

source: 'stringdb'

delimiter: '\t'
header_delimiter: '/'

2 changes: 0 additions & 2 deletions examples/maps/genepage-2-gene.yaml
@@ -2,8 +2,6 @@ name: 'genepage-2-gene'
metadata:
  description: 'Mapping file provided by Xenbase that maps from GENEPAGE to GENE'

source: 'Xenbase'

delimiter: '\t'

has_header: 'False'
14 changes: 9 additions & 5 deletions koza/model/config/source_config.py
@@ -121,6 +121,13 @@ class ColumnFilter:

@dataclass(frozen=True)
class DatasetDescription:
"""
These options should be treated as being in alpha, as we need
to align with various efforts (hcls, translator infores)
These currently do not serve a purpose in koza other
than documentation
"""
id: str = None # TODO constrain to a curie?
name: str = None # If empty use source name
ingest_title: str = None # Map to biolink name
@@ -139,8 +146,6 @@ class SourceConfig:
    TODO document fields
    TODO override translation table path?
    delimiter:
        separator string similar to what works in str.split()
        https://docs.python.org/3/library/stdtypes.html#str.split
@@ -154,8 +159,6 @@
    standard_format: StandardFormat = None
    metadata: Union[DatasetDescription, str] = None
    columns: List[Union[str, Dict[str, FieldType]]] = None
    node_properties: List[str] = None
    edge_properties: List[str] = None
    required_properties: List[str] = None
    delimiter: str = None
    header_delimiter: str = None
@@ -300,13 +303,14 @@ def field_type_map(self):

@dataclass(config=PydanticConfig)
class PrimaryFileConfig(SourceConfig):
    node_properties: List[str] = None
    edge_properties: List[str] = None
    depends_on: List[str] = field(default_factory=list)
    on_map_failure: MapErrorEnum = MapErrorEnum.warning


@dataclass(config=PydanticConfig)
class MapFileConfig(SourceConfig):
    source: str = None
    key: str = None
    values: List[str] = None
    curie_prefix: str = None
