first pass at #40
kshefchek committed Oct 11, 2021
1 parent 81d4cda commit 7b20646
Showing 11 changed files with 129 additions and 87 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml → .github/workflows/build.yml
@@ -1,6 +1,6 @@
 # Builds and runs pytest on ubuntu-latest
 # Tests python versions >=3.6
-name: run tests
+name: Build
 
 on:
   push:
4 changes: 4 additions & 0 deletions README.md
@@ -1,3 +1,7 @@
+[![Pyversions](https://img.shields.io/pypi/pyversions/koza.svg)](https://pypi.python.org/pypi/koza)
+![](https://github.com/monarch-initiative/koza/actions/workflows/build.yml/badge.svg)
+[![PyPi](https://img.shields.io/pypi/v/koza.svg)](https://pypi.python.org/pypi/koza)
+
 ### Koza
 
 ![pupa](docs/img/pupa.png) Data transformation framework
1 change: 1 addition & 0 deletions examples/maps/entrez-2-string.yaml
@@ -5,6 +5,7 @@ metadata:
 
 delimiter: '\t'
 header_delimiter: '/'
+header: 0
 
 # Assumes that no identifiers are overlapping
 # otherwise these should go into separate configs
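The `header: 0` addition works together with the `'/'` header delimiter above: it pins the header to the file's first line instead of letting the reader infer one. A minimal sketch of the difference, assuming a STRING-style commented header line (the actual file contents here are an assumption):

```python
from csv import reader
from io import StringIO

io_str = StringIO("# NCBI taxid / entrez / STRING\n9606\t1017\t9606.ENSP00000243067\n")

# header: 0 -> parse the line at 0-based index 0 as the header,
# splitting on header_delimiter '/'
fieldnames = next(reader(io_str, delimiter='/'))
fieldnames[0] = fieldnames[0].lstrip('#')  # same comment stripping the reader does
print([f.strip() for f in fieldnames])     # ['NCBI taxid', 'entrez', 'STRING']

# 'infer' would skip this line entirely, because inference ignores
# lines that start with '#'
```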
2 changes: 0 additions & 2 deletions examples/maps/genepage-2-gene.yaml
@@ -4,8 +4,6 @@ metadata:
 
 delimiter: '\t'
 
-has_header: 'False'
-
 files:
   - './examples/data/XenbaseGenepageToGeneIdMapping.txt'
 
2 changes: 1 addition & 1 deletion examples/xenbase/gene-information.yaml
@@ -7,7 +7,7 @@ standard_format: 'gpi'
 
 compression: 'gzip'
 
-skip_lines: 22
+header: 23
 
 metadata: './examples/xenbase/metadata.yaml'
 
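With the reworked reader (csv_reader.py below), `header: N` replaces the `skip_lines`/`has_header` pair: the reader consumes N lines, then parses the next line as the header, i.e. N is the 0-based index of the header line. A runnable sketch mirroring the int branch of `_set_header` in this commit (the file content is fabricated):

```python
from csv import reader
from io import StringIO

header = 23  # 0-based index of the header line
lines = "".join(f"!comment line {i}\n" for i in range(header)) + "id\tname\nXB-GENE-1\tshh\n"

r = reader(StringIO(lines), delimiter='\t')
line_num = 0
while line_num < header:
    next(r)
    line_num = r.line_num  # csv.reader counts lines as they are read

print(next(r))  # ['id', 'name'] -- the line at index 23
```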
2 changes: 0 additions & 2 deletions examples/xenbase/gene-literature.yaml
@@ -2,8 +2,6 @@ name: 'gene-literature'
 
 delimiter: '\t'
 
-has_header: False
-
 metadata: './examples/xenbase/metadata.yaml'
 
 files:
4 changes: 2 additions & 2 deletions koza/__init__.py
@@ -1,2 +1,2 @@
-"""Koza, an ETL framework for the Biolink model"""
-__version__ = '0.1.1'
+"""Koza, an ETL framework for LinkML data models"""
+__version__ = '0.1.2'
174 changes: 102 additions & 72 deletions koza/io/reader/csv_reader.py
@@ -1,8 +1,8 @@
 import logging
 from csv import reader
-from typing import IO, Any, Dict, Iterator
+from typing import IO, Any, Dict, Iterator, List, Union
 
-from koza.model.config.source_config import FieldType
+from koza.model.config.source_config import FieldType, HeaderMode
 
 LOG = logging.getLogger(__name__)

@@ -45,10 +45,9 @@ def __init__(
         io_str: IO[str],
         field_type_map: Dict[str, FieldType] = None,
         delimiter: str = ",",
-        has_header: bool = True,
+        header: Union[int, HeaderMode] = HeaderMode.infer,
         header_delimiter: str = None,
         dialect: str = "excel",
-        skip_lines: int = 0,
         skip_blank_lines: bool = True,
         name: str = "csv file",
         *args,
@@ -57,26 +56,30 @@
         """
         :param io_str: Any IO stream that yields a string
                        See https://docs.python.org/3/library/io.html#io.IOBase
-        :param name: filename or alias
         :param field_type_map: A dictionary of field names and their type (using the FieldType enum)
         :param delimiter: Field delimiter (eg. '\t' ',' ' ')
-        :param has_header: true if the file has a header, default=True
+        :param header: 0-based index of the line that contains the header,
+                       or a header mode 'infer'|'none' (default='infer');
+                       'infer' uses the first non-empty, uncommented line,
+                       'none' uses the field_type_map keys, if supplied
         :param header_delimiter: delimiter for the header row, default = self.delimiter
         :param dialect: csv dialect, default=excel
         :param skip_blank_lines: true to skip blank lines, false to insert NaN for blank lines
+        :param name: filename or alias
         :param args: additional args to pass to csv.reader
         :param kwargs: additional kwargs to pass to csv.reader
         """
         self.io_str = io_str
         self.field_type_map = field_type_map
         self.dialect = dialect
-        self.has_header = has_header
+        self.header = header
         self.header_delimiter = header_delimiter if header_delimiter else delimiter
-        self.skip_lines = skip_lines
         self.skip_blank_lines = skip_blank_lines
         self.name = name
 
         self.line_num = 0
-        self.fieldnames = []
 
+        self._header = None
 
         if delimiter == '\\s':
             delimiter = ' '
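Taken together, the new signature supports three header styles. A usage sketch, under the assumption that the class shown here is `CSVReader` and imports as below (the data is fabricated):

```python
from io import StringIO

from koza.io.reader.csv_reader import CSVReader
from koza.model.config.source_config import FieldType, HeaderMode

data = "gene\tscore\nSHH\t0.9\n"

# default 'infer': the first non-blank, uncommented line becomes the header
rows = CSVReader(StringIO(data), delimiter='\t')

# an int pins the header to that 0-based line index
rows = CSVReader(StringIO(data), delimiter='\t', header=0)

# 'none': no header line in the file; names come from field_type_map
rows = CSVReader(
    StringIO("SHH\t0.9\n"),
    field_type_map={'gene': FieldType.str, 'score': FieldType.str},
    delimiter='\t',
    header=HeaderMode.none,
)
print(next(iter(rows)))  # {'gene': 'SHH', 'score': '0.9'}
```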
Expand All @@ -90,67 +93,8 @@ def __iter__(self) -> Iterator:

def __next__(self) -> Dict[str, Any]:

while self.line_num < self.skip_lines:
next(self.reader)
self.line_num = self.reader.line_num

if self.line_num == self.skip_lines:

if not self.has_header and not self.field_type_map:
raise ValueError(
f"there is no header and columns have not been supplied\n"
f"configure the 'columns' property in the source yaml"
)

if self.has_header:
fieldnames = next(
reader(
self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
)
)
# todo: maybe comment character should be specified?
fieldnames[0] = fieldnames[0].lstrip('#')
fieldnames[0] = fieldnames[0].lstrip('!!')
fieldnames = [f.strip() for f in fieldnames]
else:
fieldnames = list(self.field_type_map.keys())

self.fieldnames = fieldnames

if self.field_type_map:

configured_fields = list(self.field_type_map.keys())

if set(configured_fields) > set(fieldnames):
raise ValueError(
f"Configured columns missing in source file {self.name}\n"
f"{set(configured_fields) - set(fieldnames)}"
)

if set(fieldnames) > set(configured_fields):
LOG.warning(
f"Additional column(s) in source file {self.name}\n"
f"{set(fieldnames) - set(configured_fields)}\n"
f"Checking if new column(s) inserted at end of the row"
)
# add to type map
for new_fields in set(fieldnames) - set(configured_fields):
self.field_type_map[new_fields] = FieldType.str

# Check if the additional columns are appended
# not sure if this would useful or just noise
if fieldnames[: len(configured_fields)] != configured_fields:
LOG.warning(
f"Additional columns located within configured fields\n"
f"given: {configured_fields}\n"
f"found: {fieldnames}"
)
else:
self.field_type_map = {field: FieldType.str for field in fieldnames}
LOG.info(f"No headers supplied for {self.name}, found {fieldnames}")

else:
self.fieldnames = self.field_type_map.keys()
if not self._header:
self._set_header()

try:
row = next(self.reader)
Expand All @@ -163,18 +107,21 @@ def __next__(self) -> Dict[str, Any]:
if self.skip_blank_lines:
while not row:
row = next(self.reader)
else:
row = ['NaN' for _ in range(len(self._header))]

# Check row length discrepancies for each row
# TODO currently varying line lengths will raise an exception
# and hard fail, we should probably make these warnings and report
# out which lines vary
# Could also create a custom exception and allow the client code
# to determine what to do here
fields_len = len(self.fieldnames)
fields_len = len(self._header)
row_len = len(row)
stripped_row = [val.strip() for val in row]

# if we've made it here we can convert a row to a dict
field_map = dict(zip(self.fieldnames, row))
field_map = dict(zip(self._header, stripped_row))

if fields_len > row_len:
raise ValueError(
@@ -204,3 +151,86 @@ def __next__(self) -> Dict[str, Any]:
             LOG.warning(key_error)
 
         return typed_field_map
+
+    def _set_header(self):
+        if isinstance(self.header, int):
+            while self.line_num < self.header:
+                next(self.reader)
+                self.line_num = self.reader.line_num
+            self._header = self._parse_header_line()
+
+            if self.field_type_map:
+                self._compare_headers_to_supplied_columns()
+            else:
+                self.field_type_map = {field: FieldType.str for field in self._header}
+
+        elif self.header == 'infer':
+            self._header = self._parse_header_line(skip_blank_or_commented_lines=True)
+            LOG.info(f"headers for {self.name} parsed as {self._header}")
+            if self.field_type_map:
+                self._compare_headers_to_supplied_columns()
+            else:
+                self.field_type_map = {field: FieldType.str for field in self._header}
+
+        elif self.header == 'none':
+            if self.field_type_map:
+                self._header = list(self.field_type_map.keys())
+            else:
+                raise ValueError(
+                    f"there is no header and columns have not been supplied\n"
+                    f"configure the 'columns' property in the source yaml"
+                )
+
+    def _parse_header_line(self, skip_blank_or_commented_lines: bool = False) -> List[str]:
+        """
+        Parse the header line and return a list of headers
+        """
+        fieldnames = next(
+            reader(
+                self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
+            )
+        )
+        if skip_blank_or_commented_lines:
+            while not fieldnames or fieldnames[0].startswith('#'):
+                fieldnames = next(
+                    reader(
+                        self.io_str, **{'delimiter': self.header_delimiter, 'dialect': self.dialect}
+                    )
+                )
+
+        # todo: maybe the comment character should be configurable?
+        fieldnames[0] = fieldnames[0].lstrip('#')
+        fieldnames[0] = fieldnames[0].lstrip('!!')
+        return [f.strip() for f in fieldnames]
+
+    def _compare_headers_to_supplied_columns(self):
+        """
+        Compare the parsed header to the supplied columns
+        """
+        configured_fields = list(self.field_type_map.keys())
+
+        if set(configured_fields) > set(self._header):
+            raise ValueError(
+                f"Configured columns missing in source file {self.name}\n"
+                f"{set(configured_fields) - set(self._header)}"
+            )
+
+        if set(self._header) > set(configured_fields):
+            LOG.warning(
+                f"Additional column(s) in source file {self.name}\n"
+                f"{set(self._header) - set(configured_fields)}\n"
+                f"Checking if new column(s) inserted at end of the row"
+            )
+            # add the new columns to the type map
+            for new_field in set(self._header) - set(configured_fields):
+                self.field_type_map[new_field] = FieldType.str
+
+        # Check if the additional columns are appended
+        # not sure if this would be useful or just noise
+        if self._header[: len(configured_fields)] != configured_fields:
+            LOG.warning(
+                f"Additional columns located within configured fields\n"
+                f"given: {configured_fields}\n"
+                f"found: {self._header}"
+            )
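The three checks in `_compare_headers_to_supplied_columns` reduce to two strict-superset tests plus a prefix comparison. Illustrated with hypothetical column names:

```python
configured = ['gene_id', 'symbol']        # from the source yaml
found = ['gene_id', 'symbol', 'taxon']    # parsed from the file

set(configured) > set(found)            # False -> no ValueError, nothing configured is missing
set(found) > set(configured)            # True  -> warn; 'taxon' is added to the type map as str
found[: len(configured)] == configured  # True  -> extras are appended, so no second warning
```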
20 changes: 17 additions & 3 deletions koza/model/config/source_config.py
@@ -9,7 +9,7 @@
 from typing import Dict, List, Union
 
 import yaml
-from pydantic import StrictFloat, StrictInt, StrictStr
+from pydantic import StrictFloat, StrictInt, StrictStr, StrictBool
 from pydantic.dataclasses import dataclass
 
 from koza.model.config.pydantic_config import PydanticConfig
@@ -111,6 +111,15 @@ class TransformMode(str, Enum):
     loop = 'loop'
 
 
+class HeaderMode(str, Enum):
+    """
+    Enum for the supported header modes, in addition to an index-based lookup
+    """
+
+    infer = 'infer'
+    none = 'none'
+
+
 @dataclass(frozen=True)
 class ColumnFilter:
     column: str
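Because `HeaderMode` subclasses `str`, the literal strings in a source yaml compare directly against the enum members, so `Union[int, HeaderMode]` accepts either an index or a mode string. A small sketch (import path assumed from this commit):

```python
from koza.model.config.source_config import HeaderMode

assert HeaderMode.infer == 'infer'            # str comparison works in both directions
assert HeaderMode('none') is HeaderMode.none  # lookup by value
assert isinstance(HeaderMode.infer, str)      # usable anywhere a plain str is expected
```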
@@ -147,6 +156,12 @@ class SourceConfig:
     TODO document fields
 
+    header: Optional, int | HeaderMode - the 0-based index at which the
+        header appears in the file; if set to 'infer', the header is
+        taken from the first line that is not blank or commented with
+        a hash; if set to 'none', the columns field supplies the
+        column names
+
     delimiter:
         separator string similar to what works in str.split()
         https://docs.python.org/3/library/stdtypes.html#str.split
@@ -163,8 +178,7 @@
     required_properties: List[str] = None
     delimiter: str = None
     header_delimiter: str = None
-    has_header: bool = True
-    skip_lines: int = 0
+    header: Union[int, HeaderMode] = HeaderMode.infer
     skip_blank_lines: bool = True
     compression: CompressionType = None
     filters: List[ColumnFilter] = field(default_factory=list)
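In a source yaml this collapses the old `has_header`/`skip_lines` pair into a single key. A hedged sketch of the three accepted forms, shown via `yaml.safe_load` (surrounding config keys omitted):

```python
import yaml

print(yaml.safe_load("header: 23"))       # {'header': 23}       0-based header line index
print(yaml.safe_load("header: 'infer'"))  # {'header': 'infer'}  first non-blank, uncommented line
print(yaml.safe_load("header: 'none'"))   # {'header': 'none'}   column names come from the config
```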
3 changes: 1 addition & 2 deletions koza/model/source.py
@@ -51,8 +51,7 @@ def __init__(
                     field_type_map=config.field_type_map,
                     delimiter=config.delimiter,
                     header_delimiter=config.header_delimiter,
-                    has_header=config.has_header,
-                    skip_lines=config.skip_lines,
+                    header=config.header,
                 )
             )
         elif self.config.format == 'jsonl':
2 changes: 0 additions & 2 deletions tests/unit/resources/primary-source.yaml
@@ -13,8 +13,6 @@ header_delimiter: '\t'
 
 compression: 'gzip'
 
-skip_lines: 0
-
 files:
   - '9606.protein.links.detailed.v11.0.txt.gz'
   - '10090.protein.links.detailed.v11.0.txt.gz'
