Changes from 2 commits
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -76,6 +76,17 @@ repos:
        files: ^package\.json$
        pass_filenames: false

  # Python encoding check - prevent regression of UTF-8 encoding fixes (PR #782)
  - repo: local
    hooks:
      - id: check-file-encoding
        name: Check file encoding parameters
        entry: python scripts/check_encoding.py
        language: system
        types: [python]
Contributor
Severity: high

The PR description mentions that the hook is "Scoped to apps/backend/", but this configuration will run the hook on all Python files in the repository. To match the described behavior and avoid running the check on files outside the backend application, you should add a files pattern to the hook definition.

        types: [python]
        files: ^apps/backend/

Author
✅ Already Correct - The hook is properly scoped to apps/backend/ with the files: ^apps/backend/ directive on line 87. This ensures the validation only runs on backend Python files where we've applied the UTF-8 encoding fixes from PR #782. The frontend doesn't need these checks since it doesn't have the same file I/O patterns. Thank you for the review!

        files: ^apps/backend/
        description: Ensures all file operations specify encoding="utf-8"

  # Python linting (apps/backend/)
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.14.10
189 changes: 189 additions & 0 deletions scripts/check_encoding.py
@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""
Check File Encoding
===================

Pre-commit hook to ensure all file operations specify UTF-8 encoding.

This prevents Windows encoding issues where Python defaults to cp1252 instead of UTF-8.
"""

import argparse
import re
import sys
from pathlib import Path

# Fix Windows console encoding for emoji output
if sys.platform == "win32":
    try:
        sys.stdout.reconfigure(encoding='utf-8')
    except AttributeError:
        # Python < 3.7
        import codecs
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')


class EncodingChecker:
"""Checks Python files for missing UTF-8 encoding parameters."""

def __init__(self):
self.issues = []

def check_file(self, filepath: Path) -> bool:
"""
Check a single Python file for encoding issues.

Returns:
True if file passes checks, False if issues found
"""
try:
content = filepath.read_text(encoding="utf-8")
except UnicodeDecodeError:
self.issues.append(f"{filepath}: File is not UTF-8 encoded")
return False

file_issues = []

# Check 1: open() without encoding
# Pattern: open(...) without encoding= parameter
for match in re.finditer(r'open\s*\([^)]+\)', content):
call = match.group()

# Skip if it's binary mode (must contain 'b' in mode string)
# Matches: "rb", "wb", "ab", "r+b", "w+b", etc.
if re.search(r'["\'][rwax+]*b[rwax+]*["\']', call):
continue

# Skip if it already has encoding (use word boundary for robustness)
if re.search(r'\bencoding\s*=', call):
continue

# Get line number
line_num = content[:match.start()].count('\n') + 1
file_issues.append(
f"{filepath}:{line_num} - open() without encoding parameter"
)

# Check 2: Path.read_text() without encoding
for match in re.finditer(r'\.read_text\s*\([^)]*\)', content):
call = match.group()

# Skip if it already has encoding (use word boundary for robustness)
if re.search(r'\bencoding\s*=', call):
continue

line_num = content[:match.start()].count('\n') + 1
file_issues.append(
f"{filepath}:{line_num} - .read_text() without encoding parameter"
)

# Check 3: Path.write_text() without encoding
for match in re.finditer(r'\.write_text\s*\([^)]+\)', content):
call = match.group()

# Skip if it already has encoding (use word boundary for robustness)
if re.search(r'\bencoding\s*=', call):
continue

line_num = content[:match.start()].count('\n') + 1
file_issues.append(
f"{filepath}:{line_num} - .write_text() without encoding parameter"
)

# Check 4: json.load() with open() without encoding
for match in re.finditer(r'json\.load\s*\(\s*open\s*\([^)]+\)', content):
call = match.group()

# Skip if open() has encoding (use word boundary for robustness)
if re.search(r'\bencoding\s*=', call):
continue

line_num = content[:match.start()].count('\n') + 1
file_issues.append(
f"{filepath}:{line_num} - json.load(open()) without encoding in open()"
)

# Check 5: json.dump() with open() without encoding
for match in re.finditer(r'json\.dump\s*\([^,]+,\s*open\s*\([^)]+\)', content):
call = match.group()

# Skip if open() has encoding (use word boundary for robustness)
if re.search(r'\bencoding\s*=', call):
continue

line_num = content[:match.start()].count('\n') + 1
file_issues.append(
f"{filepath}:{line_num} - json.dump(..., open()) without encoding in open()"
)

self.issues.extend(file_issues)
return len(file_issues) == 0

def check_files(self, filepaths: list[Path]) -> int:
"""
Check multiple files.

Returns:
Number of files with issues
"""
for filepath in filepaths:
if not filepath.exists():
continue

if not filepath.suffix == '.py':
continue

self.check_file(filepath)

return len([f for f in self.issues if f])


def main():
"""Main entry point for pre-commit hook."""
parser = argparse.ArgumentParser(
description="Check Python files for missing UTF-8 encoding parameters"
)
parser.add_argument(
'filenames',
nargs='*',
help='Filenames to check'
)
parser.add_argument(
'--verbose',
action='store_true',
help='Show all issues found'
)

args = parser.parse_args()

# Convert filenames to Path objects
files = [Path(f) for f in args.filenames]

# Run checks
checker = EncodingChecker()
failed_count = checker.check_files(files)

Check notice (Code scanning / CodeQL): Unused local variable (Note)

Variable failed_count is not used.
Author
✅ Fixed - Removed the unused failed_count variable from line 164. The variable was only being assigned but never used. Thank you for catching this!
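
For reference, the call after that removal would presumably be the bare invocation below (a sketch of the described fix; the later commit itself is not shown in this diff):

    checker.check_files(files)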


    # Report results
    if checker.issues:
        print("❌ Encoding issues found:")
        print()
        for issue in checker.issues:
            print(f" {issue}")
        print()
        print("💡 Fix: Add encoding=\"utf-8\" parameter to file operations")
        print()
        print("Examples:")
        print(' open(path, encoding="utf-8")')
        print(' Path(file).read_text(encoding="utf-8")')
        print(' Path(file).write_text(content, encoding="utf-8")')
        print()
        return 1

    if args.verbose:
        print(f"✅ All {len(files)} files pass encoding checks")

    return 0


if __name__ == "__main__":
    sys.exit(main())
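
For reference, a minimal sketch of exercising the checker outside pre-commit, assuming scripts/check_encoding.py is importable (the import path and the sample file path below are illustrative, not taken from the PR):

    from pathlib import Path

    from check_encoding import EncodingChecker  # assumes scripts/ is on sys.path

    checker = EncodingChecker()
    checker.check_files([Path("apps/backend/example.py")])  # illustrative path
    for issue in checker.issues:
        print(issue)  # e.g. "apps/backend/example.py:12 - open() without encoding parameter"

The same checks also run through the hook's entry point, python scripts/check_encoding.py <files...> --verbose, which is how pre-commit invokes the script per the configuration above.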