Skip to content

Commit

Permalink
Merge pull request #116 from monarch-initiative/zip
Browse files Browse the repository at this point in the history
Implement Zip decompression
  • Loading branch information
glass-ships authored Nov 22, 2023
2 parents eb4a332 + 3234fc5 commit 20ccd0a
Show file tree
Hide file tree
Showing 22 changed files with 991 additions and 30,975 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ koza validate \
```bash
koza validate \
--file ./examples/data/ddpheno.json.gz \
--format json \
--compression gzip
--format json
```

#### Transform
Expand Down
6 changes: 3 additions & 3 deletions docs/Usage/ingests.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ Creating this ingest will require three things:

This YAML file sets properties for the ingest of a single file type from within a Source.

???+ tip "Paths are relative to the directory from which you execute Koza."
!!! tip "Paths are relative to the directory from which you execute Koza."

| __Required properties__ | |
| --- | --- |
| `name` | Name of the source |
| `files` | List of files to process |
|||
| __Optional properties__ | |
| `file_archive` | Path to a file archive containing the files to process |
| `file_archive` | Path to a file archive containing the file(s) to process <br/> Supported archive formats: zip, gzip |
| `format` | Format of the data file(s) (CSV or JSON) |
| `sssom_config` | Configures usage of SSSOM mapping files |
| `depends_on` | List of map config files to use |
Expand Down Expand Up @@ -165,7 +165,7 @@ Creating this ingest will require three things:
```python
# other imports, eg. uuid, pydantic, etc.
import uuid
from biolink.pydanticmodel import Gene, PairwiseGeneToGeneInteraction
from biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction
# Koza imports
from koza.cli_runner import get_koza_app
Expand Down
3 changes: 1 addition & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ Koza also includes some examples to help you get started (see `koza/examples`).
```bash
koza validate \
--file ./examples/data/ddpheno.json.gz \
--format json \
--compression gzip
--format json
```

!!! list "Transform"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import re
import uuid

# from biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein
from koza.model.biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein
from biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein

from koza.cli_runner import get_koza_app

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import uuid

# from biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction
from koza.model.biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction
from biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction

from koza.cli_runner import get_koza_app

Expand Down
3 changes: 1 addition & 2 deletions examples/string-w-map/map-protein-links-detailed.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import uuid

# from biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction
from koza.model.biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction
from biolink.pydanticmodel_v2 import Gene, PairwiseGeneToGeneInteraction

from koza.cli_runner import get_koza_app

Expand Down
3 changes: 1 addition & 2 deletions examples/string/protein-links-detailed.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import re
import uuid

# from biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein
from koza.model.biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein
from biolink.pydanticmodel_v2 import PairwiseGeneToGeneInteraction, Protein

from koza.cli_runner import get_koza_app

Expand Down
1,377 changes: 735 additions & 642 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "koza"
version = "0.5.0"
version = "0.5.1"
description = "Data transformation framework for LinkML data models"
authors = [
"The Monarch Initiative <[email protected]>",
Expand All @@ -15,8 +15,8 @@ packages = [
]

[tool.poetry.dependencies]
python = "^3.8"
linkml = ">=1.6.2"
python = "^3.9"
linkml = ">=1.6.3"
pydantic = "^2.4"
pyyaml = ">=5.0.0"
requests = "^2.24.0"
Expand All @@ -30,7 +30,7 @@ sssom = "^0.3.41"
black = "^23.10.0"
ruff = "*"
pytest = ">=6.0.0"
biolink-model = ">=3.0.1" # ">=3.5.5"
biolink-model = { git = "https://github.com/glass-ships/biolink-model.git", branch = "dependencies" } # ">=3.5.5"
dask = ">=2022.5.2"
mkdocs = ">=1.4.2"
mkdocs-material = ">=9.1.16"
Expand Down
22 changes: 15 additions & 7 deletions src/koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Dict, Union
import yaml

# from linkml_validator.validator import Validator
from linkml.validator import validate
from pydantic.error_wrappers import ValidationError

from koza.converter.kgx_converter import KGXConverter
Expand All @@ -30,6 +30,8 @@ def __init__(
output_dir: str = './output',
output_format: OutputFormat = OutputFormat('jsonl'),
schema: str = None,
node_type: str = None,
edge_type: str = None,
logger=None,
):
self.source = source
Expand All @@ -43,8 +45,13 @@ def __init__(
self.logger = logger

if schema:
# self.validator = Validator(schema=schema)
# self.validate = True
# self.schema = schema
# self.node_type = node_type
# self.edge_type = edge_type
self.converter = KGXConverter()
else:
self.validate = False

if source.config.depends_on is not None:
for map_file in source.config.depends_on:
Expand Down Expand Up @@ -138,22 +145,23 @@ def next_row():

def write(self, *entities):
# If a schema/validator is defined, validate before writing
if hasattr(self, 'validator'):
# if self.validate:
if hasattr(self, 'schema'):
(nodes, edges) = self.converter.convert(entities)
if self.output_format == OutputFormat.tsv:
if nodes:
for node in nodes:
self.validator.validate(obj=node, target_class="NamedThing", strict=True)
validate(instance=node, target_class=self.node_type, schema=self.schema, strict=True)
if edges:
for edge in edges:
self.validator.validate(obj=edge, target_class="Association", strict=True)
validate(instance=edge, target_class=self.edge_type, schema=self.schema, strict=True)
elif self.output_format == OutputFormat.jsonl:
if nodes:
for node in nodes:
self.validator.validate(obj=node, target_class="NamedThing", strict=True)
validate(instance=node, target_class=self.node_type, schema=self.schema, strict=True)
if edges:
for edge in edges:
self.validator.validate(obj=edge, target_class="Association", strict=True)
validate(instance=edge, target_class=self.edge_type, schema=self.schema, strict=True)

self.writer.write(entities)

Expand Down
8 changes: 6 additions & 2 deletions src/koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def transform_source(
global_table: str = None,
local_table: str = None,
schema: str = None,
node_type: str = None,
edge_type: str = None,
row_limit: int = None,
verbose: bool = None,
log: bool = False,
Expand Down Expand Up @@ -77,7 +79,7 @@ def transform_source(
logger,
)

koza_app = _set_koza_app(koza_source, translation_table, output_dir, output_format, schema, logger)
koza_app = _set_koza_app(koza_source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger)
koza_app.process_maps()
koza_app.process_sources()

Expand Down Expand Up @@ -164,11 +166,13 @@ def _set_koza_app(
output_dir: str = './output',
output_format: OutputFormat = OutputFormat('tsv'),
schema: str = None,
node_type: str = None,
edge_type: str = None,
logger=None,
) -> KozaApp:
"""Create a KozaApp object for a given source"""

koza_apps[source.config.name] = KozaApp(source, translation_table, output_dir, output_format, schema, logger)
koza_apps[source.config.name] = KozaApp(source, translation_table, output_dir, output_format, schema, node_type, edge_type, logger)
logger.debug(f"koza_apps entry created for {source.config.name}: {koza_apps[source.config.name]}")
return koza_apps[source.config.name]

Expand Down
2 changes: 1 addition & 1 deletion src/koza/converter/biolink_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from biolink.pydanticmodel import Gene
from biolink.pydanticmodel_v2 import Gene

from koza.cli_runner import koza_app

Expand Down
37 changes: 23 additions & 14 deletions src/koza/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from os import PathLike
from pathlib import Path
from typing import IO, Any, Dict, Union
from zipfile import ZipFile, is_zipfile

import requests

Expand All @@ -34,35 +35,43 @@ def open_resource(resource: Union[str, PathLike]) -> IO[str]:
:return: str, next line in resource
"""
if Path(resource).exists():
# Try gzip first
try:
file = gzip.open(resource, 'rt')
file.read(1)
file.seek(0)
except OSError:
file = open(resource, 'r')
return file

elif isinstance(resource, str) and resource.startswith('http'):
# Check if resource is a remote file
if isinstance(resource, str) and resource.startswith('http'):
tmp_file = tempfile.TemporaryFile('w+b')
request = requests.get(resource)
if request.status_code != 200:
raise ValueError(f"Remote file returned {request.status_code}: {request.text}")
tmp_file.write(request.content)
request.close() # not sure this is needed
# request.close() # not sure this is needed
tmp_file.seek(0)
if resource.endswith('gz'):
# This should be more robust, either check headers
# or use https://github.com/ahupp/python-magic
remote_file = gzip.open(tmp_file, 'rt')
return remote_file

else:
return TextIOWrapper(tmp_file)

# If resource is not remote or local, raise error
elif not Path(resource).exists():
raise ValueError(
f"Cannot open local or remote file: {resource}. Check the URL/path, and that the file exists, and try again."
)

# If resource is local, check for compression
if is_zipfile(resource):
with ZipFile(resource, 'r') as zip_file:
file = TextIOWrapper(zip_file.open(zip_file.namelist()[0], 'r'))#, encoding='utf-8')
# file = zip_file.read(zip_file.namelist()[0], 'r').decode('utf-8')
elif str(resource).endswith('gz'):
file = gzip.open(resource, 'rt')
file.read(1)
file.seek(0)

# If resource is local and not compressed, open as text
else:
raise ValueError(f"Cannot open local or remote file: {resource}")
file = open(resource, 'r')
return file


def check_data(entry, path) -> bool:
Expand Down
Loading

0 comments on commit 20ccd0a

Please sign in to comment.