Config frame overhaul
zeptofine committed Nov 1, 2023
1 parent 4148204 · commit f6f8b50
Showing 20 changed files with 1,647 additions and 1,549 deletions.
15 changes: 15 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Module",
+            "type": "python",
+            "request": "launch",
+            "module": "imdataset_creator.gui",
+            "justMyCode": true
+        }
+    ]
+}
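This launch configuration starts the GUI under the debugger as a module run, roughly the equivalent of `python -m imdataset_creator.gui` from the workspace root.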
12 changes: 11 additions & 1 deletion imdataset_creator/__init__.py
@@ -2,5 +2,15 @@
 from .alphanumeric_sort import alphanumeric_sort
 from .config_handler import ConfigHandler
 from .configs import FilterData, MainConfig
-from .datarules import DatasetBuilder, ExprDict, File, Filter, Input, Output, Producer, Rule, chunk_split
+from .datarules import (
+    DatasetBuilder,
+    ExprDict,
+    File,
+    Filter,
+    Input,
+    Output,
+    Producer,
+    Rule,
+    chunk_split,
+)
 from .scenarios import FileScenario, OutputScenario
32 changes: 24 additions & 8 deletions imdataset_creator/__main__.py
@@ -25,19 +25,31 @@
 )
 
 CPU_COUNT = int(cpu_count())
-logging.basicConfig(level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()])
+logging.basicConfig(
+    level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
+)
 app = typer.Typer(pretty_exceptions_show_locals=True, pretty_exceptions_short=True)
 log = logging.getLogger()
 
 
 @app.command()
 def main(
-    config_path: Annotated[Path, Option(help="Where the dataset config is placed")] = Path("config.json"),
-    database_path: Annotated[Path, Option(help="Where the database is placed")] = Path("filedb.arrow"),
-    threads: Annotated[int, Option(help="multiprocessing threads")] = CPU_COUNT * 3 // 4,
+    config_path: Annotated[
+        Path, Option(help="Where the dataset config is placed")
+    ] = Path("config.json"),
+    database_path: Annotated[Path, Option(help="Where the database is placed")] = Path(
+        "filedb.arrow"
+    ),
+    threads: Annotated[int, Option(help="multiprocessing threads")] = CPU_COUNT
+    * 3
+    // 4,
     chunksize: Annotated[int, Option(help="imap chunksize")] = 5,
-    population_chunksize: Annotated[int, Option(help="chunksize when populating the df")] = 100,
-    population_interval: Annotated[int, Option(help="save interval in secs when populating the df")] = 60,
+    population_chunksize: Annotated[
+        int, Option(help="chunksize when populating the df")
+    ] = 100,
+    population_interval: Annotated[
+        int, Option(help="save interval in secs when populating the df")
+    ] = 60,
     simulate: Annotated[bool, Option(help="stops before conversion")] = False,
    verbose: Annotated[bool, Option(help="prints converted files")] = False,
    sort_by: Annotated[str, Option(help="Which database column to sort by")] = "path",
@@ -147,12 +159,16 @@ def main(
         files: list[File]
         if db_cfg.rules:
             filter_t = p.add_task("filtering", total=0)
-            files = [resolved[file] for file in db.filter(set(resolved)).get_column("path")]
+            files = [
+                resolved[file] for file in db.filter(set(resolved)).get_column("path")
+            ]
             p.update(filter_t, total=len(files), completed=len(files))
         else:
             files = list(resolved.values())
 
-        scenarios = list(db_cfg.parse_files(p.track(files, description="parsing files")))
+        scenarios = list(
+            db_cfg.parse_files(p.track(files, description="parsing files"))
+        )
         if len(scenarios) != len(files):
             p.log(f"{len(files) - len(scenarios)} files are completed")
 
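The hunks above mostly reflow main()'s typer options to shorter lines. For context, a minimal, self-contained sketch of the Annotated/Option pattern the CLI uses (hypothetical command and options; assumes typer >= 0.9, where Annotated metadata is supported):

```python
from pathlib import Path
from typing import Annotated

import typer

app = typer.Typer()


@app.command()
def run(
    config_path: Annotated[
        Path, typer.Option(help="Where the dataset config is placed")
    ] = Path("config.json"),
    threads: Annotated[int, typer.Option(help="multiprocessing threads")] = 4,
) -> None:
    # typer exposes these as --config-path and --threads on the command line.
    typer.echo(f"config={config_path} threads={threads}")


if __name__ == "__main__":
    app()
```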
33 changes: 25 additions & 8 deletions imdataset_creator/config_handler.py
@@ -13,23 +13,34 @@
 class ConfigHandler:
     def __init__(self, cfg: MainConfig):
         # generate `Input`s
-        self.inputs: list[Input] = [Input.from_cfg(folder["data"]) for folder in cfg["inputs"]]
+        self.inputs: list[Input] = [
+            Input.from_cfg(folder["data"]) for folder in cfg["inputs"]
+        ]
         # generate `Output`s
-        self.outputs: list[Output] = [Output.from_cfg(folder["data"]) for folder in cfg["output"]]
+        self.outputs: list[Output] = [
+            Output.from_cfg(folder["data"]) for folder in cfg["output"]
+        ]
         # generate `Producer`s
         self.producers: list[Producer] = [
-            Producer.all_producers[p["name"]].from_cfg(p["data"]) for p in cfg["producers"]
+            Producer.all_producers[p["name"]].from_cfg(p["data"])
+            for p in cfg["producers"]
         ]
 
         # generate `Rule`s
-        self.rules: list[Rule] = [Rule.all_rules[r["name"]].from_cfg(r["data"]) for r in cfg["rules"]]
+        self.rules: list[Rule] = [
+            Rule.all_rules[r["name"]].from_cfg(r["data"]) for r in cfg["rules"]
+        ]
 
     @overload
-    def gather_images(self, sort=True, reverse=False) -> Generator[tuple[Path, list[Path]], None, None]:
+    def gather_images(
+        self, sort=True, reverse=False
+    ) -> Generator[tuple[Path, list[Path]], None, None]:
         ...
 
     @overload
-    def gather_images(self, sort=False, reverse=False) -> Generator[tuple[Path, PathGenerator], None, None]:
+    def gather_images(
+        self, sort=False, reverse=False
+    ) -> Generator[tuple[Path, PathGenerator], None, None]:
         ...
 
     def gather_images(
@@ -38,15 +49,21 @@ def gather_images
         for input_ in self.inputs:
             gen = input_.run()
             if sort:
-                yield input_.folder, list(map(Path, sorted(map(str, gen), key=alphanumeric_sort, reverse=reverse)))
+                yield input_.folder, list(
+                    map(
+                        Path,
+                        sorted(map(str, gen), key=alphanumeric_sort, reverse=reverse),
+                    )
+                )
             else:
                 yield input_.folder, gen
 
     def get_outputs(self, file: File) -> list[OutputScenario]:
         return [
             OutputScenario(str(pth), output.filters)
             for output in self.outputs
-            if not (pth := output.folder / Path(output.format_file(file))).exists() or output.overwrite
+            if not (pth := output.folder / Path(output.format_file(file))).exists()
+            or output.overwrite
         ]
 
     def parse_files(self, files: Iterable[File]) -> Generator[FileScenario, None, None]:
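get_outputs leans on an assignment expression inside the comprehension's filter. A minimal illustration of that walrus idiom (made-up paths): bind the computed target once in the condition, then reuse the same binding in the produced element.

```python
from pathlib import Path

names = ["a.png", "b.png"]
# Keep only targets that do not exist yet; `pth` is bound in the filter
# and reused when building the result.
targets = [
    str(pth)
    for name in names
    if not (pth := Path("/tmp/out") / name).exists()
]
```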
8 changes: 7 additions & 1 deletion imdataset_creator/datarules/dataset_builder.py
@@ -277,7 +277,8 @@ def get_unfinished_existing(self) -> LazyFrame:
     def filter(self, lst) -> DataFrame:  # noqa: A003
         if len(self.unready_rules):
             warnings.warn(
-                f"{len(self.unready_rules)} filters are not initialized and will not be populated", stacklevel=2
+                f"{len(self.unready_rules)} filters are not initialized and will not be populated",
+                stacklevel=2,
             )
 
         vdf: DataFrame = self.__df.filter(pl.col("path").is_in(lst))
@@ -330,6 +331,10 @@ def __repr__(self) -> str:
     def comply_to_schema(self, schema: SchemaDefinition) -> DataFrame:
         ...
 
+    @overload
+    def comply_to_schema(self, schema: SchemaDefinition, in_place=False) -> DataFrame:
+        ...
+
     @overload
     def comply_to_schema(self, schema: SchemaDefinition, in_place=True) -> None:
         ...
@@ -338,4 +343,5 @@ def comply_to_schema(self, schema: SchemaDefinition, in_place: bool = False) ->
         new_df: DataFrame = pl.concat((self.__df, DataFrame(schema=schema)), how="diagonal")
         if in_place:
             self.__df = new_df
+            return None
         return new_df
4 changes: 2 additions & 2 deletions imdataset_creator/datarules/image_rules.py
@@ -2,7 +2,7 @@
 
 import os
 from collections.abc import Callable
-from enum import Enum
+from enum import Enum, StrEnum
 from functools import cache
 from types import MappingProxyType
 from typing import Literal, Self
@@ -129,7 +129,7 @@ def get_size(pth):
 }
 
 
-class HASHERS(str, Enum):
+class HASHERS(StrEnum):
     """
     Available hashers.
     """
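image_rules.py swaps the str/Enum mixin for StrEnum. A quick sketch of what that buys (Python 3.11+; hypothetical member name): members are real strings, so comparisons and serialization work without reaching for .value.

```python
from enum import StrEnum


class Hasher(StrEnum):  # hypothetical stand-in for the HASHERS members
    AVERAGE = "average"


assert Hasher.AVERAGE == "average"
assert isinstance(Hasher.AVERAGE, str)
assert str(Hasher.AVERAGE) == "average"  # StrEnum.__str__ returns the value
```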
2 changes: 1 addition & 1 deletion imdataset_creator/enum_helpers.py
@@ -5,4 +5,4 @@
 
 
 def listostr2listoenum(lst: list[str], enum: type[T]) -> list[T]:
-    return [enum._member_map_[k] for k in lst]  # type: ignore
+    return [enum[k] for k in lst]  # type: ignore
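The helper now uses Enum's public lookup-by-name (class subscription) instead of the private _member_map_. A self-contained usage sketch (hypothetical Color enum):

```python
from enum import Enum
from typing import TypeVar

T = TypeVar("T", bound=Enum)


def listostr2listoenum(lst: list[str], enum: type[T]) -> list[T]:
    # Enum[name] looks a member up by name, e.g. Color["RED"] -> Color.RED.
    return [enum[k] for k in lst]


class Color(Enum):  # hypothetical enum for illustration
    RED = 1
    GREEN = 2


assert listostr2listoenum(["RED", "GREEN"], Color) == [Color.RED, Color.GREEN]
```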