Skip to content

Iteration on .git-hooks/pretty_xml.py #90

@schwehr

Description

@schwehr

I took a pass at .git-hooks/pretty_xml.py to see if I could improve it. Here is what I came up with. There might be some mistakes and things that can be improved. I removed the --cached flag from the `git diff` command. I'm not sure if that's better or not.

#!/usr/bin/env python
"""Formats XML into a more human readable form."""

import argparse
import os
import pathlib
import subprocess

from xml.dom import minidom
from xml.parsers import expat


def remove_whitespace_nodes(node: minidom.Node) -> None:
    """Strip whitespace-only text nodes from the subtree rooted at *node*.

    The tree is modified in place. Text nodes that contain any
    non-whitespace character are left untouched.
    """
    # Walk a snapshot of the children: childNodes is a live list, so
    # removing from it while iterating directly would skip siblings.
    for kid in tuple(node.childNodes):
        is_blank_text = (
            kid.nodeType == minidom.Node.TEXT_NODE and not kid.data.strip()
        )
        if is_blank_text:
            node.removeChild(kid)
        elif kid.hasChildNodes():
            remove_whitespace_nodes(kid)


def format_xml(file_path: pathlib.Path, dry_run: bool) -> None:
    """Pretty-print an XML file in place, preferring xmllint over minidom.

    xmllint is tried first. If it is not installed or exits with an error,
    the file is parsed and re-serialized with minidom instead.

    Args:
        file_path: XML file to format.
        dry_run: When true, the file is only checked for parseability and
            is never rewritten.

    Raises:
        expat.ExpatError: If the minidom fallback cannot parse the file.
    """
    str_path = str(file_path)
    command = ["xmllint", "--format", str_path]
    # --noout discards the formatted result (dry run); -o rewrites in place.
    command += ["--noout"] if dry_run else ["-o", str_path]
    try:
        subprocess.run(command, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        # FileNotFoundError: xmllint is not installed.
        # CalledProcessError: xmllint rejected the file; fall through to
        # minidom and let that report the error.
        pass
    else:
        return

    # Fallback to using minidom plus fixing blank lines.
    content = file_path.read_text(encoding="utf-8")
    try:
        dom = minidom.parseString(content)
    except expat.ExpatError as e:
        print(f"[error] Failed to parse {file_path}: {e}")
        raise

    if dry_run:
        # The file parsed cleanly; a dry run must not modify it.
        return

    remove_whitespace_nodes(dom)
    pretty_xml = dom.toprettyxml(indent="  ")

    # Avoid extra blank lines, and end the file with a newline as xmllint does.
    kept_lines = [line for line in pretty_xml.splitlines() if line.strip()]
    file_path.write_text("\n".join(kept_lines) + "\n", encoding="utf-8")


def main() -> None:
    """Format every changed XML file reported by ``git diff`` and stage it.

    With ``--dry-run`` the files are only checked and nothing is staged.

    NOTE(review): ``git diff`` is run without ``--cached``, so it lists
    files with unstaged working-tree changes rather than staged ones;
    confirm that this is what the hook should operate on.

    Raises:
        subprocess.CalledProcessError: If a git command exits with an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction)
    args = parser.parse_args()

    # Get list of changed files (Added, Copied or Modified only).
    command = ["git", "diff", "--name-only", "--diff-filter=ACM"]
    # check=True: a git failure must not be misreported as "no xml files".
    result = subprocess.run(
        command, capture_output=True, encoding="utf-8", check=True
    ).stdout
    files = [pathlib.Path(f) for f in result.splitlines() if f.endswith(".xml")]

    if not files:
        print("Failed to find any xml files with changes.")
        return

    for file in files:
        print(f"Formatting {file}")
        format_xml(file, dry_run=args.dry_run)
        if args.dry_run:
            print(f"Skipping git add for {file}.")
        else:
            subprocess.run(["git", "add", file], check=True)

# Run the formatter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions