diff --git a/.github/workflows/test.yml b/.github/workflows/build.yml
similarity index 97%
rename from .github/workflows/test.yml
rename to .github/workflows/build.yml
index ef3763c..c841610 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/build.yml
@@ -1,6 +1,6 @@
 # Builds and runs pytest on ubuntu-latest
 # Tests python versions >=3.6
-name: run tests
+name: Build
 
 on:
   push:
diff --git a/README.md b/README.md
index d765e18..ca58670 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+[![Pyversions](https://img.shields.io/pypi/pyversions/koza.svg)](https://pypi.python.org/pypi/koza)
+![](https://github.com/monarch-initiative/koza/actions/workflows/build.yml/badge.svg)
+[![PyPi](https://img.shields.io/pypi/v/koza.svg)](https://pypi.python.org/pypi/koza)
+
 ### Koza
 ![pupa](docs/img/pupa.png)
 Data transformation framework
diff --git a/examples/maps/entrez-2-string.yaml b/examples/maps/entrez-2-string.yaml
index 223ccde..e337a2f 100644
--- a/examples/maps/entrez-2-string.yaml
+++ b/examples/maps/entrez-2-string.yaml
@@ -5,6 +5,7 @@ metadata:
 
 delimiter: '\t'
 header_delimiter: '/'
+header: 0
 
 # Assumes that no identifiers are overlapping
 # otherwise these should go into separate configs
diff --git a/examples/maps/genepage-2-gene.yaml b/examples/maps/genepage-2-gene.yaml
index 4a23f91..a7151ef 100644
--- a/examples/maps/genepage-2-gene.yaml
+++ b/examples/maps/genepage-2-gene.yaml
@@ -4,8 +4,6 @@ metadata:
 
 delimiter: '\t'
 
-has_header: 'False'
-
 files:
   - './examples/data/XenbaseGenepageToGeneIdMapping.txt'
diff --git a/examples/xenbase/gene-information.yaml b/examples/xenbase/gene-information.yaml
index aa691c4..7c0d9b2 100644
--- a/examples/xenbase/gene-information.yaml
+++ b/examples/xenbase/gene-information.yaml
@@ -7,7 +7,7 @@ standard_format: 'gpi'
 
 compression: 'gzip'
 
-skip_lines: 22
+header: 23
 
 metadata: './examples/xenbase/metadata.yaml'
diff --git a/examples/xenbase/gene-literature.yaml b/examples/xenbase/gene-literature.yaml
index 974111a..7a2ff57 100644
--- a/examples/xenbase/gene-literature.yaml
+++ b/examples/xenbase/gene-literature.yaml
@@ -2,8 +2,6 @@ name: 'gene-literature'
 
 delimiter: '\t'
 
-has_header: False
-
 metadata: './examples/xenbase/metadata.yaml'
 
 files:
diff --git a/koza/__init__.py b/koza/__init__.py
index c7f3e91..c91b4c3 100644
--- a/koza/__init__.py
+++ b/koza/__init__.py
@@ -1,2 +1,2 @@
-"""Koza, an ETL framework for the Biolink model"""
-__version__ = '0.1.1'
+"""Koza, an ETL framework for LinkML data models"""
+__version__ = '0.1.2'
diff --git a/koza/io/reader/csv_reader.py b/koza/io/reader/csv_reader.py
index 9d1984f..fda7031 100644
--- a/koza/io/reader/csv_reader.py
+++ b/koza/io/reader/csv_reader.py
@@ -1,8 +1,8 @@
 import logging
 from csv import reader
-from typing import IO, Any, Dict, Iterator
+from typing import IO, Any, Dict, Iterator, List, Union
 
-from koza.model.config.source_config import FieldType
+from koza.model.config.source_config import FieldType, HeaderMode
 
 LOG = logging.getLogger(__name__)
 
@@ -45,10 +45,9 @@ def __init__(
         io_str: IO[str],
         field_type_map: Dict[str, FieldType] = None,
         delimiter: str = ",",
-        has_header: bool = True,
+        header: Union[int, HeaderMode] = HeaderMode.infer,
         header_delimiter: str = None,
         dialect: str = "excel",
-        skip_lines: int = 0,
         skip_blank_lines: bool = True,
         name: str = "csv file",
         *args,
@@ -57,26 +56,30 @@
         """
         :param io_str: Any IO stream that yields a string
                        See https://docs.python.org/3/library/io.html#io.IOBase
-        :param name: filename or alias
         :param field_type_map: A dictionary of
                               field names and their type (using the FieldType enum)
         :param delimiter: Field delimiter (eg. '\t' ',' ' ')
-        :param has_header: true if the file has a header, default=True
+        :param header: 0-based index of the line in the file that contains the header,
+                       or a header mode 'infer'|'none' (default='infer')
+                       if 'infer', the first non-empty and uncommented line is used
+                       if 'none', the field_type_map keys are used, when a field_type_map is supplied
         :param header_delimiter: delimiter for the header row, default = self.delimiter
         :param dialect: csv dialect, default=excel
+        :param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines
+        :param name: filename or alias
         :param args: additional args to pass to csv.reader
         :param kwargs: additional kwargs to pass to csv.reader
         """
         self.io_str = io_str
         self.field_type_map = field_type_map
         self.dialect = dialect
-        self.has_header = has_header
+        self.header = header
         self.header_delimiter = header_delimiter if header_delimiter else delimiter
-        self.skip_lines = skip_lines
         self.skip_blank_lines = skip_blank_lines
         self.name = name
         self.line_num = 0
-        self.fieldnames = []
+
+        self._header = None
 
         if delimiter == '\\s':
             delimiter = ' '
@@ -90,67 +93,8 @@ def __iter__(self) -> Iterator:
 
     def __next__(self) -> Dict[str, Any]:
 
-        while self.line_num < self.skip_lines:
-            next(self.reader)
-            self.line_num = self.reader.line_num
-
-        if self.line_num == self.skip_lines:
-
-            if not self.has_header and not self.field_type_map:
-                raise ValueError(
-                    f"there is no header and columns have not been supplied\n"
-                    f"configure the 'columns' property in the source yaml"
-                )
-
-            if self.has_header:
-                fieldnames = next(
-                    reader(
-                        self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
-                    )
-                )
-                # todo: maybe comment character should be specified?
-                fieldnames[0] = fieldnames[0].lstrip('#')
-                fieldnames[0] = fieldnames[0].lstrip('!!')
-                fieldnames = [f.strip() for f in fieldnames]
-            else:
-                fieldnames = list(self.field_type_map.keys())
-
-            self.fieldnames = fieldnames
-
-            if self.field_type_map:
-
-                configured_fields = list(self.field_type_map.keys())
-
-                if set(configured_fields) > set(fieldnames):
-                    raise ValueError(
-                        f"Configured columns missing in source file {self.name}\n"
-                        f"{set(configured_fields) - set(fieldnames)}"
-                    )
-
-                if set(fieldnames) > set(configured_fields):
-                    LOG.warning(
-                        f"Additional column(s) in source file {self.name}\n"
-                        f"{set(fieldnames) - set(configured_fields)}\n"
-                        f"Checking if new column(s) inserted at end of the row"
-                    )
-                    # add to type map
-                    for new_fields in set(fieldnames) - set(configured_fields):
-                        self.field_type_map[new_fields] = FieldType.str
-
-                # Check if the additional columns are appended
-                # not sure if this would useful or just noise
-                if fieldnames[: len(configured_fields)] != configured_fields:
-                    LOG.warning(
-                        f"Additional columns located within configured fields\n"
-                        f"given: {configured_fields}\n"
-                        f"found: {fieldnames}"
-                    )
-            else:
-                self.field_type_map = {field: FieldType.str for field in fieldnames}
-                LOG.info(f"No headers supplied for {self.name}, found {fieldnames}")
-
-        else:
-            self.fieldnames = self.field_type_map.keys()
+        if not self._header:
+            self._set_header()
 
         try:
             row = next(self.reader)
@@ -163,6 +107,8 @@ def __next__(self) -> Dict[str, Any]:
 
         if self.skip_blank_lines:
             while not row:
                 row = next(self.reader)
+        else:
+            row = ['NaN' for _ in range(len(self._header))]
 
         # Check row length discrepancies for each row
         # TODO currently varying line lengths will raise an exception
         # out which lines vary
         # Could also create a custom exception and allow the client code
         # to determine what to do here
-        fields_len = len(self.fieldnames)
+        fields_len = len(self._header)
         row_len = len(row)
+        stripped_row = [val.strip() for val in row]
 
         # if we've made it here we can convert a row to a dict
-        field_map = dict(zip(self.fieldnames, row))
+        field_map = dict(zip(self._header, stripped_row))
 
         if fields_len > row_len:
             raise ValueError(
@@ -204,3 +151,86 @@ def __next__(self) -> Dict[str, Any]:
                 LOG.warning(key_error)
 
         return typed_field_map
+
+    def _set_header(self):
+        if isinstance(self.header, int):
+            while self.line_num < self.header:
+                next(self.reader)
+                self.line_num = self.reader.line_num
+            self._header = self._parse_header_line()
+
+            if self.field_type_map:
+                self._compare_headers_to_supplied_columns()
+            else:
+                self.field_type_map = {field: FieldType.str for field in self._header}
+
+        elif self.header == 'infer':
+            self._header = self._parse_header_line(skip_blank_or_commented_lines=True)
+            LOG.info(f"headers for {self.name} parsed as {self._header}")
+            if self.field_type_map:
+                self._compare_headers_to_supplied_columns()
+            else:
+                self.field_type_map = {field: FieldType.str for field in self._header}
+
+        elif self.header == 'none':
+            if self.field_type_map:
+                self._header = list(self.field_type_map.keys())
+            else:
+                raise ValueError(
+                    f"There is no header and columns have not been supplied;\n"
+                    f"configure the 'columns' property in the source yaml"
+                )
+
+    def _parse_header_line(self, skip_blank_or_commented_lines: bool = False) -> List[str]:
+        """
+        Parse the header line and return a list of headers
+        """
+        fieldnames = next(
+            reader(
+                self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
+            )
+        )
+
+        if skip_blank_or_commented_lines:
+            while not fieldnames or fieldnames[0].startswith('#'):
+                fieldnames = next(
+                    reader(
+                        self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
+                    )
+                )
+
+        # todo: maybe comment character should be specified?
+        fieldnames[0] = fieldnames[0].lstrip('#')
+        fieldnames[0] = fieldnames[0].lstrip('!!')
+        return [f.strip() for f in fieldnames]
+
+    def _compare_headers_to_supplied_columns(self):
+        """
+        Compares the parsed header to the supplied columns
+        and adds any new columns to the field_type_map
+        """
+        configured_fields = list(self.field_type_map.keys())
+
+        if set(configured_fields) > set(self._header):
+            raise ValueError(
+                f"Configured columns missing in source file {self.name}\n"
+                f"{set(configured_fields) - set(self._header)}"
+            )
+
+        if set(self._header) > set(configured_fields):
+            LOG.warning(
+                f"Additional column(s) in source file {self.name}\n"
+                f"{set(self._header) - set(configured_fields)}\n"
+                f"Checking if new column(s) inserted at end of the row"
+            )
+            # add to type map
+            for new_fields in set(self._header) - set(configured_fields):
+                self.field_type_map[new_fields] = FieldType.str
+
+        # Check if the additional columns are appended
+        # not sure if this would be useful or just noise
+        if self._header[: len(configured_fields)] != configured_fields:
+            LOG.warning(
+                f"Additional columns located within configured fields\n"
+                f"given: {configured_fields}\n"
+                f"found: {self._header}"
+            )
diff --git a/koza/model/config/source_config.py b/koza/model/config/source_config.py
index 369daaf..dba1bc3 100644
--- a/koza/model/config/source_config.py
+++ b/koza/model/config/source_config.py
@@ -9,7 +9,7 @@ from typing import Dict, List, Union
 
 import yaml
-from pydantic import StrictFloat, StrictInt, StrictStr
+from pydantic import StrictFloat, StrictInt, StrictStr, StrictBool
 from pydantic.dataclasses import dataclass
 
 from koza.model.config.pydantic_config import PydanticConfig
 
@@ -111,6 +111,15 @@ class TransformMode(str, Enum):
     loop = 'loop'
 
 
+class HeaderMode(str, Enum):
+    """
+    Enum for supported header modes in addition to an index-based lookup
+    """
+
+    infer = 'infer'
+    none = 'none'
+
+
 @dataclass(frozen=True)
 class ColumnFilter:
     column: str
@@ -147,6 +156,12 @@ class SourceConfig:
 
     TODO document fields
 
+    header: int or HeaderMode, optional - the 0-based index of the line
+        that contains the header. If header is set to 'infer',
+        the header is taken from the first line that is neither
+        blank nor commented with a hash.
+        If header is set to 'none',
+        the columns field will be used instead.
+
     delimiter: separator string similar to what works in str.split()
         https://docs.python.org/3/library/stdtypes.html#str.split
 
@@ -163,8 +178,7 @@
     required_properties: List[str] = None
     delimiter: str = None
     header_delimiter: str = None
-    has_header: bool = True
-    skip_lines: int = 0
+    header: Union[int, HeaderMode] = HeaderMode.infer
     skip_blank_lines: bool = True
    compression: CompressionType = None
     filters: List[ColumnFilter] = field(default_factory=list)
diff --git a/koza/model/source.py b/koza/model/source.py
index 45dc01b..a5f603a 100644
--- a/koza/model/source.py
+++ b/koza/model/source.py
@@ -51,8 +51,7 @@ def __init__(
                     field_type_map=config.field_type_map,
                     delimiter=config.delimiter,
                     header_delimiter=config.header_delimiter,
-                    has_header=config.has_header,
-                    skip_lines=config.skip_lines,
+                    header=config.header,
                 )
             )
         elif self.config.format == 'jsonl':
diff --git a/tests/unit/resources/primary-source.yaml b/tests/unit/resources/primary-source.yaml
index 467fd93..d520837 100644
--- a/tests/unit/resources/primary-source.yaml
+++ b/tests/unit/resources/primary-source.yaml
@@ -13,8 +13,6 @@ header_delimiter: '\t'
 
 compression: 'gzip'
 
-skip_lines: 0
-
 files:
   - '9606.protein.links.detailed.v11.0.txt.gz'
   - '10090.protein.links.detailed.v11.0.txt.gz'
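
Usage sketch: a minimal, hypothetical illustration of the three header modes introduced above. It assumes the reader class whose __init__ is changed in this diff is named CSVReader and is importable from koza/io/reader/csv_reader.py; the file paths and column names are invented for the example.

    # Sketch of the new header parameter; paths and column names are
    # hypothetical, and CSVReader is assumed to be the class whose
    # __init__ is modified in the diff above.
    from koza.io.reader.csv_reader import CSVReader
    from koza.model.config.source_config import FieldType, HeaderMode

    # 'infer' (the default): the header is parsed from the first line
    # that is neither blank nor commented with a hash
    with open('./data/genes.tsv') as genes:
        reader = CSVReader(genes, delimiter='\t', header=HeaderMode.infer)
        for row in reader:         # rows come back as Dict[str, Any]
            print(row['gene_id'])  # hypothetical column name

    # An int pins the header to a 0-based line index, replacing the
    # removed skip_lines option
    with open('./data/gene-information.gpi') as gpi:
        reader = CSVReader(gpi, delimiter='\t', header=23)
        first_row = next(reader)   # the header is parsed lazily on the first read

    # 'none': the file has no header row, so column names come from the
    # field_type_map keys (the 'columns' property in a source yaml)
    with open('./data/no-header.tsv') as no_header:
        reader = CSVReader(
            no_header,
            delimiter='\t',
            header=HeaderMode.none,
            field_type_map={'gene_id': FieldType.str, 'symbol': FieldType.str},
        )

In a source yaml the same choice is expressed through the new header key (an index such as header: 0 or header: 23, or the modes 'infer' and 'none'), which replaces the removed has_header and skip_lines keys, as the example configs in this diff show.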