diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0f996bccc2..b019697ca8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -76,6 +76,17 @@ repos:
         files: ^package\.json$
         pass_filenames: false
 
+  # Python encoding check - prevent regression of UTF-8 encoding fixes (PR #782)
+  - repo: local
+    hooks:
+      - id: check-file-encoding
+        name: Check file encoding parameters
+        entry: python scripts/check_encoding.py
+        language: system
+        types: [python]
+        files: ^apps/backend/
+        description: Ensures all file operations specify encoding="utf-8"
+
   # Python linting (apps/backend/)
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.14.10
diff --git a/scripts/check_encoding.py b/scripts/check_encoding.py
new file mode 100644
index 0000000000..acfa03c509
--- /dev/null
+++ b/scripts/check_encoding.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Check File Encoding
+===================
+
+Pre-commit hook to ensure all file operations specify UTF-8 encoding.
+
+This prevents Windows encoding issues where Python defaults to cp1252 instead of UTF-8.
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+
+# Fix Windows console encoding for emoji output
+if sys.platform == "win32":
+    try:
+        sys.stdout.reconfigure(encoding='utf-8')
+    except AttributeError:
+        # Python < 3.7
+        import codecs
+        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
+
+
+class EncodingChecker:
+    """Checks Python files for missing UTF-8 encoding parameters."""
+
+    def __init__(self):
+        self.issues = []
+
+    def check_file(self, filepath: Path) -> bool:
+        """
+        Check a single Python file for encoding issues.
+
+        Returns:
+            True if file passes checks, False if issues found
+        """
+        try:
+            content = filepath.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            self.issues.append(f"{filepath}: File is not UTF-8 encoded")
+            return False
+
+        file_issues = []
+
+        # Check 1: open() without encoding
+        # Pattern: open(...) without encoding= parameter
+        for match in re.finditer(r'open\s*\([^)]+\)', content):
+            call = match.group()
+
+            # Skip if it's binary mode (must contain 'b' in mode string)
+            # Matches: "rb", "wb", "ab", "r+b", "w+b", etc.
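+            # Note: any quoted argument made up only of mode characters will match this
+            # heuristic (e.g. a literal filename "ab"), which is acceptable for a lint check.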
+            if re.search(r'["\'][rwax+]*b[rwax+]*["\']', call):
+                continue
+
+            # Skip if it already has encoding (use word boundary for robustness)
+            if re.search(r'\bencoding\s*=', call):
+                continue
+
+            # Get line number
+            line_num = content[:match.start()].count('\n') + 1
+            file_issues.append(
+                f"{filepath}:{line_num} - open() without encoding parameter"
+            )
+
+        # Check 2: Path.read_text() without encoding
+        for match in re.finditer(r'\.read_text\s*\([^)]*\)', content):
+            call = match.group()
+
+            # Skip if it already has encoding (use word boundary for robustness)
+            if re.search(r'\bencoding\s*=', call):
+                continue
+
+            line_num = content[:match.start()].count('\n') + 1
+            file_issues.append(
+                f"{filepath}:{line_num} - .read_text() without encoding parameter"
+            )
+
+        # Check 3: Path.write_text() without encoding
+        for match in re.finditer(r'\.write_text\s*\([^)]+\)', content):
+            call = match.group()
+
+            # Skip if it already has encoding (use word boundary for robustness)
+            if re.search(r'\bencoding\s*=', call):
+                continue
+
+            line_num = content[:match.start()].count('\n') + 1
+            file_issues.append(
+                f"{filepath}:{line_num} - .write_text() without encoding parameter"
+            )
+
+        # Check 4: json.load() with open() without encoding
+        for match in re.finditer(r'json\.load\s*\(\s*open\s*\([^)]+\)', content):
+            call = match.group()
+
+            # Skip if open() has encoding (use word boundary for robustness)
+            if re.search(r'\bencoding\s*=', call):
+                continue
+
+            line_num = content[:match.start()].count('\n') + 1
+            file_issues.append(
+                f"{filepath}:{line_num} - json.load(open()) without encoding in open()"
+            )
+
+        # Check 5: json.dump() with open() without encoding
+        for match in re.finditer(r'json\.dump\s*\([^,]+,\s*open\s*\([^)]+\)', content):
+            call = match.group()
+
+            # Skip if open() has encoding (use word boundary for robustness)
+            if re.search(r'\bencoding\s*=', call):
+                continue
+
+            line_num = content[:match.start()].count('\n') + 1
+            file_issues.append(
+                f"{filepath}:{line_num} - json.dump(..., open()) without encoding in open()"
+            )
+
+        self.issues.extend(file_issues)
+        return len(file_issues) == 0
+
+    def check_files(self, filepaths: list[Path]) -> int:
+        """
+        Check multiple files.
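+
+        Non-Python files and paths that do not exist are skipped.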
+
+        Returns:
+            Number of files with issues
+        """
+        failed = 0
+        for filepath in filepaths:
+            if not filepath.exists():
+                continue
+
+            if filepath.suffix != '.py':
+                continue
+
+            if not self.check_file(filepath):
+                failed += 1
+
+        return failed
+
+
+def main():
+    """Main entry point for pre-commit hook."""
+    parser = argparse.ArgumentParser(
+        description="Check Python files for missing UTF-8 encoding parameters"
+    )
+    parser.add_argument(
+        'filenames',
+        nargs='*',
+        help='Filenames to check'
+    )
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Show all issues found'
+    )
+
+    args = parser.parse_args()
+
+    # Convert filenames to Path objects
+    files = [Path(f) for f in args.filenames]
+
+    # Run checks
+    checker = EncodingChecker()
+    failed_count = checker.check_files(files)
+
+    # Report results
+    if checker.issues:
+        print("❌ Encoding issues found:")
+        print()
+        for issue in checker.issues:
+            print(f"  {issue}")
+        print()
+        print("💡 Fix: Add encoding=\"utf-8\" parameter to file operations")
+        print()
+        print("Examples:")
+        print('  open(path, encoding="utf-8")')
+        print('  Path(file).read_text(encoding="utf-8")')
+        print('  Path(file).write_text(content, encoding="utf-8")')
+        print()
+        return 1
+
+    if args.verbose:
+        print(f"✅ All {len(files)} files pass encoding checks")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_check_encoding.py b/tests/test_check_encoding.py
new file mode 100644
index 0000000000..add2330d62
--- /dev/null
+++ b/tests/test_check_encoding.py
@@ -0,0 +1,355 @@
+"""Tests for the encoding check script."""
+
+import tempfile
+from pathlib import Path
+
+# Import the checker
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+from check_encoding import EncodingChecker
+
+
+class TestEncodingChecker:
+    """Test the EncodingChecker class."""
+
+    def test_detects_open_without_encoding(self):
+        """Should detect open() calls without encoding parameter."""
+        code = '''
+def read_file(path):
+    with open(path) as f:
+        return f.read()
+'''
+        # Create temp file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            assert "open() without encoding" in checker.issues[0]
+        finally:
+            temp_path.unlink()
+
+    def test_allows_open_with_encoding(self):
+        """Should allow open() calls with encoding parameter."""
+        code = '''
+def read_file(path):
+    with open(path, encoding="utf-8") as f:
+        return f.read()
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_allows_binary_mode_without_encoding(self):
+        """Should allow binary mode without encoding (correct behavior)."""
+        code = '''
+def read_file(path):
+    with open(path, "rb") as f:
+        return f.read()
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
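+    # Sketch of an additional case: the mode-string regex in check_encoding.py also
+    # accepts mixed binary modes such as "r+b", so these should not be flagged.
+    def test_allows_mixed_binary_mode_without_encoding(self):
+        """Should allow mixed binary mode (r+b) without encoding."""
+        code = '''
+def update_file(path, data):
+    with open(path, "r+b") as f:
+        f.write(data)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+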
+    def test_allows_write_binary_mode_without_encoding(self):
+        """Should allow write binary mode (wb) without encoding."""
+        code = '''
+def write_file(path, data):
+    with open(path, "wb") as f:
+        f.write(data)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_allows_append_binary_mode_without_encoding(self):
+        """Should allow append binary mode (ab) without encoding."""
+        code = '''
+def append_file(path, data):
+    with open(path, "ab") as f:
+        f.write(data)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_detects_text_write_mode_without_encoding(self):
+        """Should detect text write mode (w) without encoding."""
+        code = '''
+def write_file(path, content):
+    with open(path, "w") as f:
+        f.write(content)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            assert "open() without encoding" in checker.issues[0]
+        finally:
+            temp_path.unlink()
+
+    def test_detects_path_read_text_without_encoding(self):
+        """Should detect Path.read_text() without encoding."""
+        code = '''
+from pathlib import Path
+
+def read_file(path):
+    return Path(path).read_text()
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            assert "read_text() without encoding" in checker.issues[0]
+        finally:
+            temp_path.unlink()
+
+    def test_detects_path_write_text_without_encoding(self):
+        """Should detect Path.write_text() without encoding."""
+        code = '''
+from pathlib import Path
+
+def write_file(path, content):
+    Path(path).write_text(content)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            assert "write_text() without encoding" in checker.issues[0]
+        finally:
+            temp_path.unlink()
+
+    def test_detects_json_load_without_encoding(self):
+        """Should detect json.load(open()) without encoding in open()."""
+        code = '''
+import json
+
+def read_json(path):
+    with open(path) as f:
+        return json.load(f)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            # Detects the open() call without encoding
+        finally:
+            temp_path.unlink()
+
+    def test_allows_path_read_text_with_encoding(self):
+        """Should allow Path.read_text() with encoding parameter."""
+        code = '''
+from pathlib import Path
+
+def read_file(path):
+    return Path(path).read_text(encoding="utf-8")
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_allows_path_write_text_with_encoding(self):
+        """Should allow Path.write_text() with encoding parameter."""
+        code = '''
+from pathlib import Path
+
+def write_file(path, content):
+    Path(path).write_text(content, encoding="utf-8")
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_allows_json_dump_with_encoding(self):
+        """Should allow json.dump() with encoding in open()."""
+        code = '''
+import json
+
+def write_json(path, data):
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(data, f)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_detects_json_dump_without_encoding(self):
+        """Should detect json.dump() with open() without encoding."""
+        code = '''
+import json
+
+def write_json(path, data):
+    with open(path, "w") as f:
+        json.dump(data, f)
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 1
+            # Detects the open() call without encoding
+        finally:
+            temp_path.unlink()
+
+    def test_multiple_issues_in_single_file(self):
+        """Should detect multiple encoding issues in a single file."""
+        code = '''
+from pathlib import Path
+
+def process_files(input_path, output_path):
+    # Missing encoding in open()
+    with open(input_path) as f:
+        content = f.read()
+
+    # Missing encoding in Path.write_text()
+    Path(output_path).write_text(content)
+
+    return content
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            assert result is False
+            assert len(checker.issues) == 2
+        finally:
+            temp_path.unlink()
+
+    def test_skips_non_python_files(self):
+        """Should skip files that are not Python files."""
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding="utf-8") as f:
+            f.write("with open(path) as f: pass")
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            failed_count = checker.check_files([temp_path])
+
+            assert failed_count == 0
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()
+
+    def test_detects_encoding_with_spaces(self):
+        """Should detect encoding parameter even with spaces around equals sign."""
+        code = '''
+def read_file(path):
+    # This has spaces: encoding = "utf-8"
+    with open(path, encoding = "utf-8") as f:
+        return f.read()
+'''
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding="utf-8") as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        try:
+            checker = EncodingChecker()
+            result = checker.check_file(temp_path)
+
+            # Should pass: the encoding pattern allows whitespace around '='
+            assert result is True
+            assert len(checker.issues) == 0
+        finally:
+            temp_path.unlink()