diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 34b599f..f3e1297 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -24,4 +24,4 @@ jobs: FLIT_USERNAME: ${{ secrets.PYPI_USERNAME }} FLIT_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - make publish \ No newline at end of file + make publish diff --git a/Makefile b/Makefile index b618280..63554ab 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ clean: rm -rf `find . -name __pycache__` rm -f `find . -type f -name '*.py[co]' ` rm -rf .pytest_cache + rm -rf test-output rm -rf dist .PHONY: lint diff --git a/docs/index.md b/docs/index.md index f96cd7e..13e69ca 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,9 +2,9 @@ ![pupa](img/pupa.png) -*Disclaimer*: Koza is in beta; we are looking for beta testers +*Disclaimer*: Koza is in beta -Transform csv, json, yaml, jsonl, and xml and converting them to a target +Transform csv, json, jsonl, and yaml - converting them to a target csv, json, or jsonl format based on your dataclass model. Koza also can output data in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md#kgx-format-as-tsv) @@ -17,13 +17,19 @@ data in the [KGX format](https://github.com/biolink/kgx/blob/master/specificatio - Create and use translation tables to map between source and target vocabularies -#### Installation +### Installation ``` pip install koza ``` -#### Getting Started +### Getting Started + +#### Writing an ingest + +[Ingest Configuration](ingest_configuration.md) + +#### Running an ingest Send a local or remove csv file through Koza to get some basic information (headers, number of rows) @@ -47,7 +53,7 @@ koza validate \ --compression gzip ``` -###### Example: transforming StringDB +##### Example: transforming StringDB ```bash koza transform --source examples/string/metadata.yaml diff --git a/docs/ingest_configuration.md b/docs/ingest_configuration.md index 6344059..152bfd0 100644 --- a/docs/ingest_configuration.md +++ b/docs/ingest_configuration.md @@ -1,23 +1,6 @@ ## Ingest Configuration -### Source (aka metadata.yaml) - -The Source File provides metadata for the description of the dataset and the list of Source Files to be ingested - -```yaml -name: 'somethingbase' - -dataset_description: - ingest_title: 'SomethingBase' - ingest_url: 'https://somethingbase.org' - description: 'SomethingBase: A Website With Some Data' - rights: 'https://somethingbase.org/rights.html' - -# The list of source files should map -source_files: - - 'gene-information.yaml' - - 'gene-to-phenotype.yaml' -``` +Ingests are configured via a single source file yaml, and optional mapping file yaml(s) ### Source File(s) @@ -36,6 +19,12 @@ files: - './data/really-cool-data-1.json.gz' - './data/really-cool-data-2.json.gz' +# The dataset description +metadata: + description: 'SomethingBase: A Website With Some Data' + rights: 'https://somethingbase.org/rights.html' + + # in a JSON ingest, this will be the path to the array to be iterated over as the input collection json_path: - 'data' @@ -105,3 +94,19 @@ transform_mode: 'loop' # You probably don't need to set this property transform_code: 'name-of-ingest.py' ``` + +### Map File(s) + +This YAML file sets properties for creating a dictionary that an ingest depends on. +It contains the same options as a source file, excluding depends_on, node_properties, +edge_properties, and on_map_failure. It adds the following options: + +```yaml +# The column name in which to get the key for the dictionary +key: someKey + +# The column(s) in which to store as values for the key +values: + - value1 + - value2 +``` diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 016bb16..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -mkdocs diff --git a/examples/maps/custom-entrez-2-string.yaml b/examples/maps/custom-entrez-2-string.yaml index a0db0b9..41bc015 100644 --- a/examples/maps/custom-entrez-2-string.yaml +++ b/examples/maps/custom-entrez-2-string.yaml @@ -3,8 +3,6 @@ name: 'custom_entrez_2_string' metadata: description: 'Mapping file provided by StringDB that contains entrez to protein ID mappings' -source: 'stringdb' - delimiter: '\t' header_delimiter: '/' diff --git a/examples/maps/entrez-2-string.yaml b/examples/maps/entrez-2-string.yaml index 4c73e74..223ccde 100644 --- a/examples/maps/entrez-2-string.yaml +++ b/examples/maps/entrez-2-string.yaml @@ -3,8 +3,6 @@ name: 'entrez_2_string' metadata: description: 'Mapping file provided by StringDB that contains entrez to protein ID mappings' -source: 'stringdb' - delimiter: '\t' header_delimiter: '/' diff --git a/examples/maps/genepage-2-gene.yaml b/examples/maps/genepage-2-gene.yaml index 6fa039e..4a23f91 100644 --- a/examples/maps/genepage-2-gene.yaml +++ b/examples/maps/genepage-2-gene.yaml @@ -2,8 +2,6 @@ name: 'genepage-2-gene' metadata: description: 'Mapping file provided by Xenbase that maps from GENEPAGE to GENE' -source: 'Xenbase' - delimiter: '\t' has_header: 'False' diff --git a/koza/model/config/source_config.py b/koza/model/config/source_config.py index 87ec35d..97410e3 100644 --- a/koza/model/config/source_config.py +++ b/koza/model/config/source_config.py @@ -121,6 +121,13 @@ class ColumnFilter: @dataclass(frozen=True) class DatasetDescription: + """ + These options should be treated as being in alpha, as we need + to align with various efforts (hcls, translator infores) + + These currently do not serve a purpose in koza other + than documentation + """ id: str = None # TODO constrain to a curie? name: str = None # If empty use source name ingest_title: str = None # Map to biolink name @@ -139,8 +146,6 @@ class SourceConfig: TODO document fields - TODO override translation table path? - delimiter: separator string similar to what works in str.split() https://docs.python.org/3/library/stdtypes.html#str.split @@ -154,8 +159,6 @@ class SourceConfig: standard_format: StandardFormat = None metadata: Union[DatasetDescription, str] = None columns: List[Union[str, Dict[str, FieldType]]] = None - node_properties: List[str] = None - edge_properties: List[str] = None required_properties: List[str] = None delimiter: str = None header_delimiter: str = None @@ -300,13 +303,14 @@ def field_type_map(self): @dataclass(config=PydanticConfig) class PrimaryFileConfig(SourceConfig): + node_properties: List[str] = None + edge_properties: List[str] = None depends_on: List[str] = field(default_factory=list) on_map_failure: MapErrorEnum = MapErrorEnum.warning @dataclass(config=PydanticConfig) class MapFileConfig(SourceConfig): - source: str = None key: str = None values: List[str] = None curie_prefix: str = None