From b5ff0806cdfe9bd865bd78131620938deafd5c93 Mon Sep 17 00:00:00 2001
From: zeptofine
Date: Fri, 7 Jul 2023 18:35:22 -0400
Subject: [PATCH] move prefix_via_database to manage_dataframe.ipynb

---
 manage_dataframe.ipynb | 81 +++++++++++++++++++++++++++++++++++++++---
 prefix_via_database.py | 38 --------------------
 2 files changed, 77 insertions(+), 42 deletions(-)
 delete mode 100644 prefix_via_database.py

diff --git a/manage_dataframe.ipynb b/manage_dataframe.ipynb
index 7d9bd5c..56d1251 100644
--- a/manage_dataframe.ipynb
+++ b/manage_dataframe.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install matplotlib tqdm"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -7,17 +16,21 @@
    "outputs": [],
    "source": [
     "import random\n",
+    "from pathlib import Path\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
-    "import polars as pl"
+    "import polars as pl\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from dataset_filters.dataset_builder import DatasetBuilder\n",
+    "from dataset_filters import DataFilter"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Read dataframe\n"
+    "### Read dataframe\n"
    ]
   },
   {
@@ -64,7 +77,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "plot resolution vs modified time"
+    "### Plot resolution vs modified time"
    ]
   },
   {
@@ -84,6 +97,66 @@
     "plt.xlabel(\"modifiedtime\")\n",
     "plt.ylabel(\"sum resolution\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Link a list of files based on data from the database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_folder = Path(\"/mnt/Toshiba/.Grabber/\")\n",
+    "output = input_folder.with_name(f\"{input_folder.name}-linked\")\n",
+    "category = \"hash\"\n",
+    "config_path = Path(\"database_config.toml\")\n",
+    "overwrite = True\n",
+    "\n",
+    "populate = False  # you'll need to specify a filter to add to the builder if you use this\n",
+    "# define filters here\n",
+    "# from dataset_filters.external_filters import HashFilter\n",
+    "filter_list: list[DataFilter] = [\n",
+    "    # HashFilter()\n",
+    "]\n",
+    "# ^^ these filters do not change the output size; they only dictate which columns\n",
+    "# are available, in case the one you want is not there yet."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run\n",
+    "output.mkdir(exist_ok=True)\n",
+    "exts = [\".jpg\", \".jpeg\", \".png\", \".webp\"]\n",
+    "filelist = [i for i in input_folder.rglob(\"*\") if i.suffix in exts]\n",
+    "db = DatasetBuilder(\"filedb.feather\", config_path)\n",
+    "##### add filters here #####\n",
+    "if populate:\n",
+    "    db.add_filters(*filter_list)\n",
+    "    if db.filters:\n",
+    "        db.populate_df(filelist)\n",
+    "############################\n",
+    "assert category in db.df.columns, f\"selected category {category!r} is not in {db.df.columns}\"\n",
+    "file_data = db.df.filter(pl.col(\"path\").is_in(list(map(str, filelist))))\n",
+    "\n",
+    "with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:\n",
+    "    for data in t:\n",
+    "        pth = Path(data[\"path\"])\n",
+    "        hash_ = str(data[category])\n",
+    "        new_path: Path = (output / f\"{hash_}_{pth.stem}\").with_suffix(pth.suffix)\n",
+    "        if overwrite or not new_path.exists():\n",
+    "            new_path.unlink(missing_ok=True)  # drop a stale link instead of raising FileNotFoundError\n",
+    "            new_path.symlink_to(pth)\n",
+    "        t.set_description_str(hash_)\n"
+   ]
  }
 ],
 "metadata": {
diff --git a/prefix_via_database.py b/prefix_via_database.py
deleted file mode 100644
index cde1cc3..0000000
--- a/prefix_via_database.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import argparse
-from pathlib import Path
-
-import polars as pl
-
-from dataset_filters import DatasetBuilder, HashFilter
-from tqdm import tqdm
-
-
-def get_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("input", help="folder to scan")
-    parser.add_argument("category", help="the category from the database to sort by")
-    return parser
-
-
-if __name__ == "__main__":
-    args = get_parser().parse_args()
-
-    folder = Path(args.input)
-    new_folder = folder.parent / "linked"
-    new_folder.mkdir(exist_ok=True)
-
-    exts = [".jpg", ".jpeg", ".png", ".webp"]
-    filelist = list(filter(lambda i: i.suffix in exts, folder.rglob("*")))
-    db = DatasetBuilder("filedb.feather")
-    assert args.category in db.df.columns, f"selected category is not in {db.df.columns}"
-    db.add_filters(HashFilter())
-    db.populate_df(filelist)
-    file_data = db.df.filter(pl.col("path").is_in(list(map(str, filelist))))
-    with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:
-        for data in t:
-            pth = Path(data["path"])
-            hash_ = str(data[args.category])
-            new_path: Path = (new_folder / f"{hash_}_{pth.stem}").with_suffix(pth.suffix)
-            if not new_path.exists():
-                new_path.symlink_to(pth)
-            t.set_description_str(hash_)
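
For reference, the relinking loop added in the notebook boils down to the standalone sketch below. The helper name link_with_prefix is hypothetical (it does not exist in this repository); the sketch only illustrates the unlink(missing_ok=True) / symlink_to pattern used above, and assumes Python 3.8+ for missing_ok.

    from pathlib import Path


    def link_with_prefix(src: Path, dest_dir: Path, prefix: str, overwrite: bool = True) -> Path:
        """Symlink src into dest_dir as '<prefix>_<original name>' (hypothetical helper)."""
        new_path = (dest_dir / f"{prefix}_{src.stem}").with_suffix(src.suffix)
        if overwrite or not new_path.exists():
            new_path.unlink(missing_ok=True)  # remove a stale link; no error if it is not there yet
            new_path.symlink_to(src)
        return new_path


    # e.g. link_with_prefix(Path("/mnt/Toshiba/.Grabber/cat.png"), Path("/mnt/Toshiba/.Grabber-linked"), "8f2a01")
    # creates /mnt/Toshiba/.Grabber-linked/8f2a01_cat.png -> /mnt/Toshiba/.Grabber/cat.png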