#!/usr/bin/env python3
import gzip
import argparse
from os import stat_result
from stat import S_ISREG, S_ISLNK, S_ISDIR, S_ISBLK, S_ISCHR, S_ISFIFO, S_ISSOCK
from typing import TextIO
from pathlib import Path
from urllib.parse import quote
# Shown by argparse as the --help description of this script.
DESCRIPTION = """
QDirStat can read information about disk usage from cache files instead of
looking at a live file system. This allows you to easily browse disk usage
on servers and other systems where QDirStat and a GUI environment cannot be
installed on directly. This script generates cache files which you can open
in QDirStat -> "File" -> "Read Cache File...".
This is a Python rewrite of a similar Perl script, qdirstat-cache-writer.
It depends only on a standard Python 3.6+ installation, available on most
modern Linux distributions out-of-the-box, no extra deps needed. It also
does gzip compression on-the-fly instead of compressing the final file,
but can't generate the long format.
"""
# Written verbatim (stripped) at the top of every generated cache file;
# the bracketed magic line is what QDirStat uses to recognize the format.
CACHE_HEADER = """
[qdirstat 2.0 cache file]
# Automatically generated by qdirstat-generate-cache.py - do not edit
# type path size uid gid mode mtime <optional fields>
"""
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and return the parsed arguments."""
    arg_parser = argparse.ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument(
        "path", help="Path from which to start collecting disk usage data."
    )
    arg_parser.add_argument(
        "--cross-mounts",
        action="store_true",
        help="Collect data about files under other mountpoints.",
    )
    arg_parser.add_argument(
        "-o",
        "--output",
        default="qdirstat.cache.gz",
        help="Name of the file to write the cache to.",
    )
    return arg_parser.parse_args()
def get_prefix_from_mode(mode: int) -> str:
    """Map a raw ``st_mode`` value to the entry-type tag QDirStat expects."""
    # Checked in the same order as the original if/elif chain; the S_IS*
    # predicates are mutually exclusive, so order only matters cosmetically.
    type_tags = (
        (S_ISREG, "F"),
        (S_ISLNK, "L"),
        (S_ISDIR, "D"),
        (S_ISBLK, "BlockDev"),
        (S_ISCHR, "CharDev"),
        (S_ISFIFO, "FIFO"),
        (S_ISSOCK, "Socket"),
    )
    for predicate, tag in type_tags:
        if predicate(mode):
            return tag
    # Fall-through: emit a comment line so QDirStat ignores the entry.
    return "# UNKNOWN!"
def generate_path_info(path: str, stat: stat_result) -> str:
    """Generate a single text info line on a specific entry."""
    optional = ""
    # Report allocated blocks only for sparse files (fewer 512-byte blocks
    # allocated than the nominal size would need).
    if stat.st_blocks > 0 and stat.st_blocks * 512 < stat.st_size:
        optional += "\tblocks:%d" % stat.st_blocks
    if stat.st_nlink > 1:
        optional += "\tlinks:%d" % stat.st_nlink
    # Long time no see, %-format! This is ~10% faster when CPU-bound
    # than str.format/f-strings and works on all Python 3 versions.
    return "%s\t%s\t%d\t%d\t%d\t%04o\t%d%s\n" % (
        get_prefix_from_mode(stat.st_mode),
        quote(path),
        stat.st_size,
        stat.st_uid,
        stat.st_gid,
        stat.st_mode & 0o7777,
        int(stat.st_mtime),
        optional,
    )
def process_dir(dir: Path, output: TextIO, cross_mounts: bool) -> None:
    """Recursively generate the cache for a given directory.

    File entries are written first and subdirectories are queued, so each
    directory's section stays contiguous in the cache file. Unreadable
    entries are recorded as comment lines instead of aborting the walk.
    """
    dirs = []
    try:
        for child in dir.iterdir():
            try:
                # Why not pathlib instead? It does not always cache
                # the lstat call and makes the program ~25% slower.
                stat = child.lstat()
            except OSError:
                # Trailing newline is essential: without it the next entry
                # would be fused onto this comment line and silently lost.
                output.write("# lstat failed: " + str(child.absolute()) + "\n")
                continue
            if S_ISDIR(stat.st_mode) and not S_ISLNK(stat.st_mode):
                dirs.append((child, stat))  # Handle dirs only after all the files
            else:
                output.write(generate_path_info(child.name, stat))
    except OSError:
        output.write("# iterdir failed: " + str(dir.resolve()) + "\n")
    for subdir, stat in dirs:
        # The leading "\n" separates directory sections for readability.
        output.write("\n" + generate_path_info(str(subdir.absolute()), stat))
        if subdir.is_mount() and not cross_mounts:
            output.write("# Not crossing mountpoint: " + str(subdir.resolve()) + "\n")
        else:
            process_dir(subdir, output, cross_mounts)
def process_tree(output_filename: str, path: str, cross_mounts: bool) -> None:
    """Write a complete QDirStat cache for *path* to *output_filename*.

    A ``.gz`` suffix on the filename selects on-the-fly gzip compression.
    """
    opener = gzip.open if output_filename.endswith(".gz") else open
    # A context manager guarantees the stream is flushed and closed; for
    # gzip output this matters most, since an unclosed stream can leave
    # the compressed trailer unwritten and the file unreadable.
    with opener(output_filename, "wt") as output:
        output.write(CACHE_HEADER.strip())
        root = Path(path).resolve()
        output.write("\n" + generate_path_info(str(root), root.lstat()))
        process_dir(root, output, cross_mounts)
if __name__ == "__main__":
    # Script entry point: parse CLI options, then walk the tree and
    # write the cache file.
    args = parse_args()
    process_tree(args.output, args.path, args.cross_mounts)