Config frame overhaul
zeptofine committed Nov 1, 2023
1 parent 4148204 · commit f6f8b50
Showing 20 changed files with 1,647 additions and 1,549 deletions.
15 changes: 15 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Module",
+            "type": "python",
+            "request": "launch",
+            "module": "imdataset_creator.gui",
+            "justMyCode": true
+        }
+    ]
+}
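This launch configuration starts the GUI under the debugger as a module run, roughly the equivalent of `python -m imdataset_creator.gui` from the workspace root.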
12 changes: 11 additions & 1 deletion imdataset_creator/__init__.py
@@ -2,5 +2,15 @@
 from .alphanumeric_sort import alphanumeric_sort
 from .config_handler import ConfigHandler
 from .configs import FilterData, MainConfig
-from .datarules import DatasetBuilder, ExprDict, File, Filter, Input, Output, Producer, Rule, chunk_split
+from .datarules import (
+    DatasetBuilder,
+    ExprDict,
+    File,
+    Filter,
+    Input,
+    Output,
+    Producer,
+    Rule,
+    chunk_split,
+)
 from .scenarios import FileScenario, OutputScenario
32 changes: 24 additions & 8 deletions imdataset_creator/__main__.py
@@ -25,19 +25,31 @@
 )
 
 CPU_COUNT = int(cpu_count())
-logging.basicConfig(level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
+logging.basicConfig(
+    level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
+)
 app = typer.Typer(pretty_exceptions_show_locals=True, pretty_exceptions_short=True)
 log = logging.getLogger()
 
 
 @app.command()
 def main(
-    config_path: Annotated[Path, Option(help="Where the dataset config is placed")] = Path("config.json"),
-    database_path: Annotated[Path, Option(help="Where the database is placed")] = Path("filedb.arrow"),
-    threads: Annotated[int, Option(help="multiprocessing threads")] = CPU_COUNT * 3 // 4,
+    config_path: Annotated[
+        Path, Option(help="Where the dataset config is placed")
+    ] = Path("config.json"),
+    database_path: Annotated[Path, Option(help="Where the database is placed")] = Path(
+        "filedb.arrow"
+    ),
+    threads: Annotated[int, Option(help="multiprocessing threads")] = CPU_COUNT
+    * 3
+    // 4,
     chunksize: Annotated[int, Option(help="imap chunksize")] = 5,
-    population_chunksize: Annotated[int, Option(help="chunksize when populating the df")] = 100,
-    population_interval: Annotated[int, Option(help="save interval in secs when populating the df")] = 60,
+    population_chunksize: Annotated[
+        int, Option(help="chunksize when populating the df")
+    ] = 100,
+    population_interval: Annotated[
+        int, Option(help="save interval in secs when populating the df")
+    ] = 60,
     simulate: Annotated[bool, Option(help="stops before conversion")] = False,
    verbose: Annotated[bool, Option(help="prints converted files")] = False,
    sort_by: Annotated[str, Option(help="Which database column to sort by")] = "path",
@@ -147,12 +159,16 @@ def main(
         files: list[File]
         if db_cfg.rules:
             filter_t = p.add_task("filtering", total=0)
-            files = [resolved[file] for file in db.filter(set(resolved)).get_column("path")]
+            files = [
+                resolved[file] for file in db.filter(set(resolved)).get_column("path")
+            ]
             p.update(filter_t, total=len(files), completed=len(files))
         else:
             files = list(resolved.values())
 
-        scenarios = list(db_cfg.parse_files(p.track(files, description="parsing files")))
+        scenarios = list(
+            db_cfg.parse_files(p.track(files, description="parsing files"))
+        )
         if len(scenarios) != len(files):
             p.log(f"{len(files) - len(scenarios)} files are completed")
 
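The hunks above mostly reflow main()'s typer options to shorter lines. For context, a minimal, self-contained sketch of the Annotated/Option pattern the CLI uses (hypothetical command and options; assumes typer >= 0.9, where Annotated metadata is supported):

```python
from pathlib import Path
from typing import Annotated

import typer

app = typer.Typer()


@app.command()
def run(
    config_path: Annotated[
        Path, typer.Option(help="Where the dataset config is placed")
    ] = Path("config.json"),
    threads: Annotated[int, typer.Option(help="multiprocessing threads")] = 4,
) -> None:
    # typer exposes these as --config-path and --threads on the command line.
    typer.echo(f"config={config_path} threads={threads}")


if __name__ == "__main__":
    app()
```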
33 changes: 25 additions & 8 deletions imdataset_creator/config_handler.py
@@ -13,23 +13,34 @@
 class ConfigHandler:
     def __init__(self, cfg: MainConfig):
         # generate `Input`s
-        self.inputs: list[Input] = [Input.from_cfg(folder["data"]) for folder in cfg["inputs"]]
+        self.inputs: list[Input] = [
+            Input.from_cfg(folder["data"]) for folder in cfg["inputs"]
+        ]
         # generate `Output`s
-        self.outputs: list[Output] = [Output.from_cfg(folder["data"]) for folder in cfg["output"]]
+        self.outputs: list[Output] = [
+            Output.from_cfg(folder["data"]) for folder in cfg["output"]
+        ]
         # generate `Producer`s
         self.producers: list[Producer] = [
-            Producer.all_producers[p["name"]].from_cfg(p["data"]) for p in cfg["producers"]
+            Producer.all_producers[p["name"]].from_cfg(p["data"])
+            for p in cfg["producers"]
         ]
 
         # generate `Rule`s
-        self.rules: list[Rule] = [Rule.all_rules[r["name"]].from_cfg(r["data"]) for r in cfg["rules"]]
+        self.rules: list[Rule] = [
+            Rule.all_rules[r["name"]].from_cfg(r["data"]) for r in cfg["rules"]
+        ]
 
     @overload
-    def gather_images(self, sort=True, reverse=False) -> Generator[tuple[Path, list[Path]], None, None]:
+    def gather_images(
+        self, sort=True, reverse=False
+    ) -> Generator[tuple[Path, list[Path]], None, None]:
         ...
 
     @overload
-    def gather_images(self, sort=False, reverse=False) -> Generator[tuple[Path, PathGenerator], None, None]:
+    def gather_images(
+        self, sort=False, reverse=False
+    ) -> Generator[tuple[Path, PathGenerator], None, None]:
         ...
 
     def gather_images(
@@ -38,15 +49,21 @@ def gather_images
         for input_ in self.inputs:
             gen = input_.run()
             if sort:
-                yield input_.folder, list(map(Path, sorted(map(str, gen), key=alphanumeric_sort, reverse=reverse)))
+                yield input_.folder, list(
+                    map(
+                        Path,
+                        sorted(map(str, gen), key=alphanumeric_sort, reverse=reverse),
+                    )
+                )
             else:
                 yield input_.folder, gen
 
     def get_outputs(self, file: File) -> list[OutputScenario]:
         return [
             OutputScenario(str(pth), output.filters)
             for output in self.outputs
-            if not (pth := output.folder / Path(output.format_file(file))).exists() or output.overwrite
+            if not (pth := output.folder / Path(output.format_file(file))).exists()
+            or output.overwrite
         ]
 
     def parse_files(self, files: Iterable[File]) -> Generator[FileScenario, None, None]:
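get_outputs leans on an assignment expression inside the comprehension's filter. A minimal illustration of that walrus idiom (made-up paths): bind the computed target once in the condition, then reuse the same binding in the produced element.

```python
from pathlib import Path

names = ["a.png", "b.png"]
# Keep only targets that do not exist yet; `pth` is bound in the filter
# and reused when building the result.
targets = [
    str(pth)
    for name in names
    if not (pth := Path("/tmp/out") / name).exists()
]
```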
8 changes: 7 additions & 1 deletion imdataset_creator/datarules/dataset_builder.py
@@ -277,7 +277,8 @@ def get_unfinished_existing(self) -> LazyFrame:
     def filter(self, lst) -> DataFrame:  # noqa: A003
         if len(self.unready_rules):
             warnings.warn(
-                f"{len(self.unready_rules)} filters are not initialized and will not be populated", stacklevel=2
+                f"{len(self.unready_rules)} filters are not initialized and will not be populated",
+                stacklevel=2,
             )
 
         vdf: DataFrame = self.__df.filter(pl.col("path").is_in(lst))
@@ -330,6 +331,10 @@ def __repr__(self) -> str:
     def comply_to_schema(self, schema: SchemaDefinition) -> DataFrame:
         ...
 
+    @overload
+    def comply_to_schema(self, schema: SchemaDefinition, in_place=False) -> DataFrame:
+        ...
+
     @overload
     def comply_to_schema(self, schema: SchemaDefinition, in_place=True) -> None:
         ...
@@ -338,4 +343,5 @@ def comply_to_schema(self, schema: SchemaDefinition, in_place: bool = False) ->
         new_df: DataFrame = pl.concat((self.__df, DataFrame(schema=schema)), how="diagonal")
         if in_place:
             self.__df = new_df
+            return None
         return new_df
4 changes: 2 additions & 2 deletions imdataset_creator/datarules/image_rules.py
@@ -2,7 +2,7 @@
 
 import os
 from collections.abc import Callable
-from enum import Enum
+from enum import Enum, StrEnum
 from functools import cache
 from types import MappingProxyType
 from typing import Literal, Self
@@ -129,7 +129,7 @@ def get_size(pth):
 }
 
 
-class HASHERS(str, Enum):
+class HASHERS(StrEnum):
     """
     Available hashers.
     """
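image_rules.py swaps the str/Enum mixin for StrEnum. A quick sketch of what that buys (Python 3.11+; hypothetical member name): members are real strings, so comparisons and serialization work without reaching for .value.

```python
from enum import StrEnum


class Hasher(StrEnum):  # hypothetical stand-in for the HASHERS members
    AVERAGE = "average"


assert Hasher.AVERAGE == "average"
assert isinstance(Hasher.AVERAGE, str)
assert str(Hasher.AVERAGE) == "average"  # StrEnum.__str__ returns the value
```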
2 changes: 1 addition & 1 deletion imdataset_creator/enum_helpers.py
@@ -5,4 +5,4 @@
 
 
 def listostr2listoenum(lst: list[str], enum: type[T]) -> list[T]:
-    return [enum._member_map_[k] for k in lst]  # type: ignore
+    return [enum[k] for k in lst]  # type: ignore
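The helper now uses Enum's public lookup-by-name (class subscription) instead of the private _member_map_. A self-contained usage sketch (hypothetical Color enum):

```python
from enum import Enum
from typing import TypeVar

T = TypeVar("T", bound=Enum)


def listostr2listoenum(lst: list[str], enum: type[T]) -> list[T]:
    # Enum[name] looks a member up by name, e.g. Color["RED"] -> Color.RED.
    return [enum[k] for k in lst]


class Color(Enum):  # hypothetical enum for illustration
    RED = 1
    GREEN = 2


assert listostr2listoenum(["RED", "GREEN"], Color) == [Color.RED, Color.GREEN]
```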