From c5c7e3907134dce6172bab4b2af4e862c745ae26 Mon Sep 17 00:00:00 2001
From: Florian M
Date: Mon, 22 Apr 2024 16:50:03 +0200
Subject: [PATCH 1/3] Script to find json mappings

---
 tools/mapping-format/find-json-mappings.py | 79 ++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 tools/mapping-format/find-json-mappings.py

diff --git a/tools/mapping-format/find-json-mappings.py b/tools/mapping-format/find-json-mappings.py
new file mode 100644
index 00000000000..96da89ac8cc
--- /dev/null
+++ b/tools/mapping-format/find-json-mappings.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+import argparse
+import sys
+import os
+
+
+def format_bytes(num):
+    for unit in ("", "K", "M", "G", "T"):
+        if abs(num) < 1000.0:
+            return f"{int(num)}{unit}"
+        num /= 1000.0
+    return f"{int(num)}P"
+
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("binary_data_dir", type=Path, help="WEBKNOSSOS binary data dir")
+    parser.add_argument(
+        "--all",
+        "-a",
+        action="store_true",
+        help="Print all files, not just old versions",
+    )
+    parser.add_argument(
+        "--plain",
+        "-p",
+        action="store_true",
+        help="Print only the file names, not the version",
+    )
+    args = parser.parse_args()
+    binary_data_dir = args.binary_data_dir
+
+    if not args.plain:
+        print(f"Scanning {binary_data_dir} for json mapping files...\n", file=sys.stderr)
+
+    seen = []
+
+    for orga_dir in [
+        item for item in binary_data_dir.iterdir() if item.exists() and item.is_dir()
+    ]:
+        for dataset_dir in orga_dir.iterdir():
+            try:
+                if dataset_dir.exists() and dataset_dir.is_dir():
+                    for layer_dir in [
+                        item
+                        for item in dataset_dir.iterdir()
+                        if item.exists() and item.is_dir()
+                    ]:
+                        mappings_dir = layer_dir.joinpath("mappings")
+                        if mappings_dir.exists():
+                            for mapping_file in [
+                                item
+                                for item in mappings_dir.iterdir()
+                                if item.name.lower().endswith(".json")
+                            ]:
+                                realpath = mapping_file.resolve()
+                                if realpath not in seen:
+                                    seen.append(realpath)
+                                    size = os.stat(realpath).st_size
+                                    print(
+                                        f"{format_bytes(size)} {mapping_file}"
+                                    )
+            except Exception as e:
+                if not args.plain:
+                    print(
+                        f"Exception while scanning dataset dir at {dataset_dir}: {e}",
+                        file=sys.stderr,
+                    )
+
+    if not args.plain:
+        print(
+            f"\nDone scanning {binary_data_dir}, listed {len(seen)} json mappings.",
+            file=sys.stderr,
+        )
+
+
+if __name__ == "__main__":
+    main()

From ea44268a36651743c8146380a32b5e12442a8593 Mon Sep 17 00:00:00 2001
From: Florian M
Date: Mon, 22 Apr 2024 17:20:23 +0200
Subject: [PATCH 2/3] skip dot dirs

---
 tools/mapping-format/find-json-mappings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mapping-format/find-json-mappings.py b/tools/mapping-format/find-json-mappings.py
index 96da89ac8cc..0bd2b1d73e0 100644
--- a/tools/mapping-format/find-json-mappings.py
+++ b/tools/mapping-format/find-json-mappings.py
@@ -37,7 +37,7 @@ def main():
     seen = []
 
     for orga_dir in [
-        item for item in binary_data_dir.iterdir() if item.exists() and item.is_dir()
+        item for item in binary_data_dir.iterdir() if item.exists() and item.is_dir() and not item.name.startswith(".")
     ]:
         for dataset_dir in orga_dir.iterdir():
             try:

From 1d97356a4975504f2d7f45579b7a9a8f1e5139c6 Mon Sep 17 00:00:00 2001
From: Florian M
Date: Wed, 11 Sep 2024 16:32:32 +0200
Subject: [PATCH 3/3] add explaining comment

---
 tools/mapping-format/find-json-mappings.py | 30 +++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tools/mapping-format/find-json-mappings.py b/tools/mapping-format/find-json-mappings.py
index 0bd2b1d73e0..7d16508996a 100644
--- a/tools/mapping-format/find-json-mappings.py
+++ b/tools/mapping-format/find-json-mappings.py
@@ -3,30 +3,22 @@
 import sys
 import os
 
-
-def format_bytes(num):
-    for unit in ("", "K", "M", "G", "T"):
-        if abs(num) < 1000.0:
-            return f"{int(num)}{unit}"
-        num /= 1000.0
-    return f"{int(num)}P"
-
+# This script searches a binaryData folder for json mapping files.
+# We had hoped to convert all json mappings to hdf5, but this got deprioritized:
+# in a json mapping, each oversegment group gets mapped to one of the ids of the group,
+# whereas agglomerate files are optimized for continuous agglomerate ids.
+# This means that preserving the ids of some json mappings would require huge and
+# hugely inefficient agglomerate files.
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("binary_data_dir", type=Path, help="WEBKNOSSOS binary data dir")
-    parser.add_argument(
-        "--all",
-        "-a",
-        action="store_true",
-        help="Print all files, not just old versions",
-    )
     parser.add_argument(
         "--plain",
         "-p",
         action="store_true",
-        help="Print only the file names, not the version",
+        help="Print only the file names, no info output",
     )
     args = parser.parse_args()
     binary_data_dir = args.binary_data_dir
@@ -75,5 +67,13 @@ def main():
     )
 
 
+def format_bytes(num):
+    for unit in ("", "K", "M", "G", "T"):
+        if abs(num) < 1000.0:
+            return f"{int(num)}{unit}"
+        num /= 1000.0
+    return f"{int(num)}P"
+
+
 if __name__ == "__main__":
     main()
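To make the reasoning in the explanatory comment concrete, here is a minimal sketch of the id-handling difference it describes; the group lists and dict shapes below are made-up illustrations, not the actual WEBKNOSSOS json or hdf5 mapping layouts.

# Sketch only: made-up ids and simplified structures, not the real file formats.
# A json mapping maps every oversegment id in a group to one representative id
# taken from that same group, so the representative ids can be arbitrary and sparse:
json_groups = [[7, 12, 903], [15, 2_000_000_001]]
json_mapping = {seg: group[0] for group in json_groups for seg in group}
# {7: 7, 12: 7, 903: 7, 15: 15, 2000000001: 15}

# Agglomerate files instead assume continuous agglomerate ids 0..N-1:
agglomerate_mapping = {seg: agg_id for agg_id, group in enumerate(json_groups) for seg in group}
# {7: 0, 12: 0, 903: 0, 15: 1, 2000000001: 1}

# Preserving the sparse representative ids (7, 15, ...) in a layout built for
# continuous ids is what would make the converted agglomerate files huge and inefficient.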