From b5ff0806cdfe9bd865bd78131620938deafd5c93 Mon Sep 17 00:00:00 2001
From: zeptofine
Date: Fri, 7 Jul 2023 18:35:22 -0400
Subject: [PATCH] move prefix_via_database to manage_dataframe.ipynb

---
 manage_dataframe.ipynb | 81 +++++++++++++++++++++++++++++++++++++++---
 prefix_via_database.py | 38 --------------------
 2 files changed, 77 insertions(+), 42 deletions(-)
 delete mode 100644 prefix_via_database.py

diff --git a/manage_dataframe.ipynb b/manage_dataframe.ipynb
index 7d9bd5c..56d1251 100644
--- a/manage_dataframe.ipynb
+++ b/manage_dataframe.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install matplotlib tqdm"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -7,17 +16,21 @@
    "outputs": [],
    "source": [
     "import random\n",
+    "from pathlib import Path\n",
     "\n",
     "import matplotlib.pyplot as plt\n",
-    "import polars as pl"
+    "import polars as pl\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "from dataset_filters.dataset_builder import DatasetBuilder\n",
+    "from dataset_filters import DataFilter"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Read dataframe\n"
+    "### Read dataframe\n"
    ]
   },
   {
@@ -64,7 +77,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "plot resolution vs modified time"
+    "### Plot resolution vs modified time"
    ]
   },
   {
@@ -84,6 +97,66 @@
     "plt.xlabel(\"modifiedtime\")\n",
     "plt.ylabel(\"sum resolution\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Link a list of files based on data from the database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_folder = Path(\"/mnt/Toshiba/.Grabber/\")\n",
+    "output = input_folder.with_name(f\"{input_folder.name}-linked\")\n",
+    "category = \"hash\"\n",
+    "config_path = Path(\"database_config.toml\")\n",
+    "overwrite = True\n",
+    "\n",
+    "populate = False  # you'll need to specify a filter to add to the builder if you use this\n",
+    "# define filters here\n",
+    "# from dataset_filters.external_filters import HashFilter\n",
+    "filter_list: list[DataFilter] = [\n",
+    "    # HashFilter()\n",
+    "]\n",
+    "# ^^ these filters do not change the output size; they only dictate which columns\n",
+    "# are available, in case the one you want is not there yet."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run\n",
+    "output.mkdir(exist_ok=True)\n",
+    "exts = [\".jpg\", \".jpeg\", \".png\", \".webp\"]\n",
+    "filelist = [i for i in input_folder.rglob(\"*\") if i.suffix in exts]\n",
+    "db = DatasetBuilder(\"filedb.feather\", config_path)\n",
+    "##### add filters here #####\n",
+    "if populate:\n",
+    "    db.add_filters(*filter_list)\n",
+    "    if db.filters:\n",
+    "        db.populate_df(filelist)\n",
+    "############################\n",
+    "assert category in db.df.columns, f\"selected category {category!r} is not in {db.df.columns}\"\n",
+    "file_data = db.df.filter(pl.col(\"path\").is_in(list(map(str, filelist))))\n",
+    "\n",
+    "with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:\n",
+    "    for data in t:\n",
+    "        pth = Path(data[\"path\"])\n",
+    "        hash_ = str(data[category])\n",
+    "        new_path: Path = (output / f\"{hash_}_{pth.stem}\").with_suffix(pth.suffix)\n",
+    "        if overwrite or not new_path.exists():\n",
+    "            new_path.unlink(missing_ok=True)  # drop a stale link instead of raising FileNotFoundError\n",
+    "            new_path.symlink_to(pth)\n",
+    "        t.set_description_str(hash_)\n"
+   ]
  }
 ],
 "metadata": {
diff --git a/prefix_via_database.py b/prefix_via_database.py
deleted file mode 100644
index cde1cc3..0000000
--- a/prefix_via_database.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import argparse
-from pathlib import Path
-
-import polars as pl
-
-from dataset_filters import DatasetBuilder, HashFilter
-from tqdm import tqdm
-
-
-def get_parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("input", help="folder to scan")
-    parser.add_argument("category", help="the category from the database to sort by")
-    return parser
-
-
-if __name__ == "__main__":
-    args = get_parser().parse_args()
-
-    folder = Path(args.input)
-    new_folder = folder.parent / "linked"
-    new_folder.mkdir(exist_ok=True)
-
-    exts = [".jpg", ".jpeg", ".png", ".webp"]
-    filelist = list(filter(lambda i: i.suffix in exts, folder.rglob("*")))
-    db = DatasetBuilder("filedb.feather")
-    assert args.category in db.df.columns, f"selected category is not in {db.df.columns}"
-    db.add_filters(HashFilter())
-    db.populate_df(filelist)
-    file_data = db.df.filter(pl.col("path").is_in(list(map(str, filelist))))
-    with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:
-        for data in t:
-            pth = Path(data["path"])
-            hash_ = str(data[args.category])
-            new_path: Path = (new_folder / f"{hash_}_{pth.stem}").with_suffix(pth.suffix)
-            if not new_path.exists():
-                new_path.symlink_to(pth)
-            t.set_description_str(hash_)
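
For reference, the relinking loop added in the notebook boils down to the standalone sketch below. The helper name link_with_prefix is hypothetical (it does not exist in this repository); the sketch only illustrates the unlink(missing_ok=True) / symlink_to pattern used above, and assumes Python 3.8+ for missing_ok.

    from pathlib import Path


    def link_with_prefix(src: Path, dest_dir: Path, prefix: str, overwrite: bool = True) -> Path:
        """Symlink src into dest_dir as '<prefix>_<original name>' (hypothetical helper)."""
        new_path = (dest_dir / f"{prefix}_{src.stem}").with_suffix(src.suffix)
        if overwrite or not new_path.exists():
            new_path.unlink(missing_ok=True)  # remove a stale link; no error if it is not there yet
            new_path.symlink_to(src)
        return new_path


    # e.g. link_with_prefix(Path("/mnt/Toshiba/.Grabber/cat.png"), Path("/mnt/Toshiba/.Grabber-linked"), "8f2a01")
    # creates /mnt/Toshiba/.Grabber-linked/8f2a01_cat.png -> /mnt/Toshiba/.Grabber/cat.png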