Skip to content

Commit

Permalink
allow computing file attributes in python, and test most of the script
Browse files Browse the repository at this point in the history
  • Loading branch information
leoschwarz committed Jun 27, 2024
1 parent 0d1bfc5 commit afab4ea
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 41 deletions.
94 changes: 53 additions & 41 deletions bfabric/scripts/bfabric_save_importresource_sample.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# TODO add integration test (with and without sample id)
"""General Importresource Feeder for bfabric
Author:
Expand All @@ -15,23 +14,25 @@
"""
from __future__ import annotations

import logging
import logging.handlers
import hashlib
import json
import os
import re
import sys
import time
import json
from pathlib import Path

from bfabric import Bfabric
from loguru import logger

from bfabric import Bfabric, BfabricConfig

BFABRIC_STORAGE_ID = 2


def save_importresource(client: Bfabric, line: str) -> None:
"""reads, splits and submit the input line to the bfabric system
Input: a line containg
md5sum;date;size;path
md5sum;timestamp;size;path
"906acd3541f056e0f6d6073a4e528570;
1345834449;
Expand All @@ -41,19 +42,52 @@ def save_importresource(client: Bfabric, line: str) -> None:
Output:
True on success otherwise an exception raise
"""
md5_checksum, file_date, file_size, file_path = line.split(";")
md5_checksum, file_unix_timestamp, file_size, file_path = get_file_attributes(line)
obj = create_importresource_dict(
config=client.config,
file_path=file_path,
file_size=file_size,
file_unix_timestamp=file_unix_timestamp,
md5_checksum=md5_checksum,
)
logger.info(obj)
res = client.save(endpoint="importresource", obj=obj)
print(json.dumps(res, indent=2))

# Format the timestamp for bfabric
file_date = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(int(file_date)))

bfabric_application_ids = client.config.application_ids
def get_file_attributes(
    file_name_or_attributes: str, base_dir: str | Path = "/srv/www/htdocs"
) -> tuple[str, int, int, str]:
    """Return ``(md5sum, timestamp, size, path)`` for one input line.

    The line either holds the four precomputed values separated by ";"
    (``md5sum;timestamp;size;path``) or just a file name. In the second case
    the attributes are computed from the file itself.

    Args:
        file_name_or_attributes: The raw input line.
        base_dir: Directory a bare file name is resolved against. An absolute
            file name replaces it entirely (regular ``pathlib`` joining).

    Returns:
        A tuple ``(md5 hex digest, unix mtime in whole seconds, size in bytes,
        path)``. For a bare file name the path is the stripped name exactly as
        given, not the resolved location.

    Raises:
        ValueError: If the line is blank, does not consist of exactly one or
            four ";"-separated fields, or the timestamp/size field is not an
            integer.
        OSError: If the file cannot be stat'ed or read.
    """
    values = file_name_or_attributes.split(";")
    if len(values) == 4:
        md5_checksum, timestamp, size, path = values
        return md5_checksum, int(timestamp), int(size), path
    if len(values) != 1:
        raise ValueError("Invalid input line format")

    filename = values[0].strip()
    if not filename:
        # A blank line would otherwise resolve to base_dir itself and fail
        # later with an unrelated OS error.
        raise ValueError("Invalid input line format")

    file_path = Path(base_dir) / filename
    file_stat = file_path.stat()
    digest = hashlib.md5()
    with file_path.open("rb") as f:
        # Hash in fixed-size chunks so large files are never loaded at once.
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest(), int(file_stat.st_mtime), file_stat.st_size, filename


def create_importresource_dict(
config: BfabricConfig, file_path: str, file_size: int, file_unix_timestamp: int, md5_checksum
) -> dict[str, str | int]:
# Format the timestamp for bfabric
file_date = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(file_unix_timestamp))
bfabric_application_ids = config.application_ids
if not bfabric_application_ids:
raise RuntimeError("No bfabric_application_ids configured. check '~/.bfabricpy.yml' file!")

bfabric_application_id, bfabric_projectid = get_bfabric_application_and_project_id(
bfabric_application_ids, file_path
)

obj = {
"applicationid": bfabric_application_id,
"filechecksum": md5_checksum,
Expand All @@ -64,18 +98,10 @@ def save_importresource(client: Bfabric, line: str) -> None:
"size": file_size,
"storageid": BFABRIC_STORAGE_ID,
}

match = re.search(
r"p([0-9]+)\/(Proteomics\/[A-Z]+_[1-9])\/.*_\d\d\d_S([0-9][0-9][0-9][0-9][0-9][0-9]+)_.*(raw|zip)$",
file_path,
)
if match:
print(f"found sampleid={match.group(3)} pattern")
obj["sampleid"] = int(match.group(3))

print(obj)
res = client.save(endpoint="importresource", obj=obj)
print(json.dumps(res, indent=2))
sample_id = get_sample_id_from_path(file_path)
if sample_id is not None:
obj["sampleid"] = sample_id
return obj


def get_sample_id_from_path(file_path: str) -> int | None:
Expand All @@ -85,10 +111,8 @@ def get_sample_id_from_path(file_path: str) -> int | None:
file_path,
)
if match:
print(f"found sampleid={match.group(3)} pattern")
logger.info(f"found sampleid={match.group(3)} pattern")
return int(match.group(3))
else:
return None


def get_bfabric_application_and_project_id(bfabric_application_ids: dict[str, int], file_path: str) -> tuple[int, int]:
Expand All @@ -101,31 +125,19 @@ def get_bfabric_application_and_project_id(bfabric_application_ids: dict[str, in
if re.search(i, file_path):
bfabric_applicationid = bfabric_application_ids[i]
re_result = re.search(r"^p([0-9]+)\/.+", file_path)
bfabric_projectid = re_result.group(1)
bfabric_projectid = int(re_result.group(1))
break
if bfabric_applicationid < 0:
logger = logging.getLogger("sync_feeder")
logger.error(f"{file_path}; no bfabric application id.")
raise RuntimeError("no bfabric application id.")
return bfabric_applicationid, bfabric_projectid


def setup_logger() -> None:
    """Attach a syslog handler to the "sync_feeder" logger and set its level to INFO."""
    sync_logger = logging.getLogger("sync_feeder")
    syslog_handler = logging.handlers.SysLogHandler(address=("130.60.81.21", 514))
    syslog_handler.setFormatter(
        logging.Formatter("%(name)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )
    sync_logger.addHandler(syslog_handler)
    sync_logger.setLevel(logging.INFO)


def main() -> None:
"""Parses arguments and calls `save_importresource`."""
setup_logger()
client = Bfabric.from_config(verbose=True)
if sys.argv[1] == "-":
print("reading from stdin ...")
logger.info("reading from stdin ...")
for input_line in sys.stdin:
save_importresource(client, input_line.rstrip())
elif sys.argv[1] == "-h":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from pathlib import Path
from unittest.mock import ANY

import pytest
from pytest_mock import MockerFixture

from bfabric.scripts.bfabric_save_importresource_sample import (
get_sample_id_from_path,
get_bfabric_application_and_project_id,
create_importresource_dict,
get_file_attributes,
)


def test_get_sample_id_from_path_when_match():
    """The sample id embedded in a matching ``.zip`` path is returned as an int."""
    sample_id = get_sample_id_from_path("p123/Proteomics/A_1/abc_002_S123456_hello.zip")
    assert sample_id == 123456


def test_get_sample_id_from_path_when_none():
    """The same path with a ``.txt`` suffix yields no sample id."""
    sample_id = get_sample_id_from_path("p123/Proteomics/A_1/abc_002_S123456_hello.txt")
    assert sample_id is None


def test_get_bfabric_application_and_project_id_when_match():
    """A path containing a configured application key resolves to (application id, project id)."""
    result = get_bfabric_application_and_project_id(
        {"bestapp": 500},
        "p123/Proteomics/bestapp/abc_002_S123456_hello.zip",
    )
    assert result == (500, 123)


def test_get_bfabric_application_and_project_id_when_no_match():
    """A path that matches none of the configured application keys raises RuntimeError."""
    known_applications = {"bestapp": 500}
    unmatched_path = "p123/Proteomics/otherapp/abc_002_S123456_hello.zip"
    with pytest.raises(RuntimeError):
        get_bfabric_application_and_project_id(known_applications, unmatched_path)


@pytest.mark.parametrize("sample_id", [123456, None])
def test_create_importresource_dict(mocker: MockerFixture, sample_id: int | None):
    """Check the importresource dict, with and without a sample id in the path.

    The application/project lookup is patched out, so only the dict assembly
    (including the UTC date formatting and the optional "sampleid" key) is
    under test here.
    """
    file_unix_timestamp = 123000
    # Use the same `is not None` check as for the expected dict below.
    if sample_id is not None:
        file_path = f"p123/Proteomics/A_1/abc_002_S{sample_id}_hello.zip"
    else:
        file_path = "p123/Proteomics/A_1/abc_002_123456_hello.txt"
    application_ids = {"bestapp": 500}
    mock_config = mocker.MagicMock(name="config", application_ids=application_ids)
    mock_get_application_and_project_id = mocker.patch(
        "bfabric.scripts.bfabric_save_importresource_sample.get_bfabric_application_and_project_id"
    )
    mock_get_application_and_project_id.return_value = (500, 123)

    obj = create_importresource_dict(
        config=mock_config,
        file_path=file_path,
        file_size=123,
        file_unix_timestamp=file_unix_timestamp,
        md5_checksum="123",
    )

    expected = {
        "applicationid": 500,
        "filechecksum": "123",
        "containerid": 123,
        # 123000 s after the epoch, formatted in UTC.
        "filedate": "1970-01-02 10:10:00",
        "relativepath": file_path,
        "name": Path(file_path).name,
        "size": 123,
        "storageid": 2,
    }
    if sample_id is not None:
        expected["sampleid"] = sample_id

    assert obj == expected
    # The patched lookup must receive the configured ids and the file path.
    mock_get_application_and_project_id.assert_called_once_with(application_ids, file_path)


def test_get_file_attributes_when_parsed() -> None:
    """A four-field line is split and its timestamp and size are converted to int."""
    parsed = get_file_attributes("abcdef123456;1000;50000;my/file.txt")
    assert parsed == ("abcdef123456", 1000, 50000, "my/file.txt")


def test_get_file_attributes_when_filename(tmp_path: Path) -> None:
    """A bare file name makes the function compute checksum, mtime and size itself.

    The absolute temporary path replaces the function's base directory when
    the two are joined, so the file is read from ``tmp_path``.
    """
    tmp_file = tmp_path / "my_file.txt"
    tmp_file.write_text("hello")
    expected_hash = "5d41402abc4b2a76b9719d911017c592"  # md5 of b"hello"
    # Compare against the real mtime so a wrong timestamp is not masked.
    expected_timestamp = int(tmp_file.stat().st_mtime)
    assert get_file_attributes(str(tmp_file)) == (expected_hash, expected_timestamp, 5, str(tmp_file))


def test_get_file_attributes_when_invalid() -> None:
    """A line with five ";"-separated fields is rejected with ValueError."""
    malformed_line = "abcdef123;1000;50000;my;file.txt"
    with pytest.raises(ValueError):
        get_file_attributes(malformed_line)


# Running this module directly as a script hands control to pytest.
if __name__ == "__main__":
    pytest.main()

0 comments on commit afab4ea

Please sign in to comment.