Skip to content

Commit

Permalink
move prefix_via_database to manage_dataframe.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
zeptofine committed Jul 7, 2023
1 parent 83fa899 commit b5ff080
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 42 deletions.
81 changes: 77 additions & 4 deletions manage_dataframe.ipynb
Original file line number Diff line number Diff line change
@@ -1,23 +1,36 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install matplotlib tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import polars as pl"
"import polars as pl\n",
"from tqdm import tqdm\n",
"\n",
"from dataset_filters.dataset_builder import DatasetBuilder\n",
"from dataset_filters import DataFilter"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Read dataframe\n"
"### Read dataframe\n"
]
},
{
Expand Down Expand Up @@ -64,7 +77,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"plot resolution vs modified time"
"### Plot resolution vs modified time"
]
},
{
Expand All @@ -84,6 +97,66 @@
"plt.xlabel(\"modifiedtime\")\n",
"plt.ylabel(\"sum resolution\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Link a list of files based on data from the database"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# --- Settings read by the linking cell below ---\n",
"\n",
"# Database-related settings.\n",
"config_path = Path(\"database_config.toml\")\n",
"category = \"hash\"  # database column whose value prefixes each link name\n",
"\n",
"# Source folder, and the sibling folder that receives the links.\n",
"input_folder = Path(\"/mnt/Toshiba/.Grabber/\")\n",
"output = input_folder.with_name(f\"{input_folder.name}-linked\")\n",
"overwrite = True\n",
"\n",
"# Set `populate` to True only after adding at least one filter to `filter_list`.\n",
"populate = False\n",
"\n",
"# Filters do not change how many files are linked; they only decide which\n",
"# columns are available, in case the one you want is not there already.\n",
"# Example:\n",
"#   from dataset_filters.external_filters import HashFilter\n",
"filter_list: list[DataFilter] = [\n",
"    # HashFilter()\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run: symlink every image found under `input_folder` into `output`, naming each\n",
"# link `<category value>_<original file name>`.\n",
"output.mkdir(exist_ok=True)\n",
"exts = [\".jpg\", \".jpeg\", \".png\", \".webp\"]\n",
"filelist = [i for i in input_folder.rglob(\"*\") if i.suffix in exts]\n",
"db = DatasetBuilder(\"filedb.feather\", config_path)\n",
"\n",
"##### add filters here #####\n",
"# Only touches the database when `populate` is enabled and filters were given.\n",
"if populate:\n",
"    db.add_filters(*filter_list)\n",
"    if db.filters:\n",
"        db.populate_df(filelist)\n",
"############################\n",
"\n",
"assert category in db.df.columns, f\"selected category is not in {db.df.columns}\"\n",
"# Keep only the database rows whose path is one of the files found above.\n",
"file_data = db.df.filter(pl.col(\"path\").is_in(list(map(str, filelist))))\n",
"\n",
"with tqdm(file_data.iter_rows(named=True), total=len(file_data)) as t:\n",
"    for data in t:\n",
"        pth = Path(data[\"path\"])\n",
"        hash_ = str(data[category])\n",
"        # Build the name from `pth.name` instead of stem + with_suffix():\n",
"        # with_suffix() replaces everything after the last dot, which would\n",
"        # truncate names when the stem or the category value contains a dot.\n",
"        new_path: Path = output / f\"{hash_}_{pth.name}\"\n",
"        if not new_path.exists() or overwrite:\n",
"            # The link is usually absent on a first run; a bare unlink() would\n",
"            # raise FileNotFoundError there, so tolerate a missing target.\n",
"            new_path.unlink(missing_ok=True)\n",
"            new_path.symlink_to(pth)\n",
"        t.set_description_str(hash_)\n"
]
}
],
"metadata": {
Expand Down
38 changes: 0 additions & 38 deletions prefix_via_database.py

This file was deleted.

0 comments on commit b5ff080

Please sign in to comment.