-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Add pre-commit hook for UTF-8 encoding enforcement #795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| Check File Encoding | ||
| =================== | ||
|
|
||
| Pre-commit hook to ensure all file operations specify UTF-8 encoding. | ||
|
|
||
| This prevents Windows encoding issues where Python defaults to cp1252 instead of UTF-8. | ||
| """ | ||
|
|
||
| import argparse | ||
| import re | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
# Fix Windows console encoding for emoji output.
# Windows consoles often default to a legacy code page (e.g. cp1252) that
# cannot represent the emoji this script prints in its report below.
if sys.platform == "win32":
    try:
        # TextIOWrapper.reconfigure() is the supported way to switch the
        # stream's encoding in place (available since Python 3.7).
        sys.stdout.reconfigure(encoding='utf-8')
    except AttributeError:
        # Python < 3.7
        # reconfigure() does not exist, so replace stdout with a UTF-8
        # writer wrapped around the underlying byte buffer instead.
        import codecs
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
|
|
||
|
|
||
class EncodingChecker:
    """Checks Python files for missing UTF-8 encoding parameters.

    The checks are regex heuristics over the raw source text: they flag
    open(), Path.read_text(), Path.write_text(), and json.load()/json.dump()
    wrapped around open() when the call does not pass an explicit
    ``encoding=`` argument.  Findings accumulate in ``self.issues`` as
    human-readable ``"path:line - message"`` strings.
    """

    # One entry per heuristic check:
    #   (regex over the source text, issue message, binary_exempt)
    # binary_exempt marks checks where a binary mode string such as "rb"
    # exempts the call -- binary file objects never take an encoding.
    _CHECKS = [
        # Check 1: open() without encoding
        (r'open\s*\([^)]+\)',
         "open() without encoding parameter", True),
        # Check 2: Path.read_text() without encoding
        (r'\.read_text\s*\([^)]*\)',
         ".read_text() without encoding parameter", False),
        # Check 3: Path.write_text() without encoding
        (r'\.write_text\s*\([^)]+\)',
         ".write_text() without encoding parameter", False),
        # Check 4: json.load() with open() without encoding
        (r'json\.load\s*\(\s*open\s*\([^)]+\)',
         "json.load(open()) without encoding in open()", False),
        # Check 5: json.dump() with open() without encoding
        (r'json\.dump\s*\([^,]+,\s*open\s*\([^)]+\)',
         "json.dump(..., open()) without encoding in open()", False),
    ]

    def __init__(self):
        # Human-readable issue strings accumulated across all checked files.
        self.issues = []

    def check_file(self, filepath: Path) -> bool:
        """
        Check a single Python file for encoding issues.

        Args:
            filepath: Path of the Python source file to scan.

        Returns:
            True if file passes checks, False if issues found
        """
        try:
            content = filepath.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            self.issues.append(f"{filepath}: File is not UTF-8 encoded")
            return False

        file_issues = []

        for pattern, message, binary_exempt in self._CHECKS:
            for match in re.finditer(pattern, content):
                call = match.group()

                # Skip binary-mode calls (must contain 'b' in the mode
                # string): matches "rb", "wb", "ab", "r+b", "w+b", etc.
                if binary_exempt and re.search(
                        r'["\'][rwax+]*b[rwax+]*["\']', call):
                    continue

                # Skip calls that already pass encoding= (word boundary so
                # identifiers merely ending in "encoding" do not count).
                if re.search(r'\bencoding\s*=', call):
                    continue

                # Translate the match offset into a 1-based line number.
                line_num = content[:match.start()].count('\n') + 1
                file_issues.append(f"{filepath}:{line_num} - {message}")

        self.issues.extend(file_issues)
        return len(file_issues) == 0

    def check_files(self, filepaths: list[Path]) -> int:
        """
        Check multiple files.

        Args:
            filepaths: Candidate paths; entries that do not exist or are
                not ``.py`` files are silently skipped.

        Returns:
            Number of files with issues
        """
        # Bug fix: the previous implementation returned len(self.issues)
        # (the total number of individual issues), not the number of
        # failing files as documented.  Count failing files directly.
        failed_files = 0
        for filepath in filepaths:
            if not filepath.exists():
                continue

            if filepath.suffix != '.py':
                continue

            if not self.check_file(filepath):
                failed_files += 1

        return failed_files
|
|
||
|
|
||
def main(argv=None):
    """Main entry point for pre-commit hook.

    Args:
        argv: Optional list of command-line arguments; defaults to
            ``sys.argv[1:]`` when None (standard argparse behavior).
            Exposed as a parameter so the hook can be driven from tests.

    Returns:
        Process exit code: 0 when all files pass, 1 when issues were found
        (a non-zero code makes pre-commit block the commit).
    """
    parser = argparse.ArgumentParser(
        description="Check Python files for missing UTF-8 encoding parameters"
    )
    parser.add_argument(
        'filenames',
        nargs='*',
        help='Filenames to check'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show all issues found'
    )

    args = parser.parse_args(argv)

    # Convert filenames to Path objects
    files = [Path(f) for f in args.filenames]

    # Run checks.  The return value (failing-file count) is intentionally
    # ignored: the report below is driven by checker.issues, which carries
    # the per-line detail.  (Previously this was bound to an unused local,
    # flagged by CodeQL.)
    checker = EncodingChecker()
    checker.check_files(files)

    # Report results
    if checker.issues:
        print("❌ Encoding issues found:")
        print()
        for issue in checker.issues:
            print(f"  {issue}")
        print()
        print("💡 Fix: Add encoding=\"utf-8\" parameter to file operations")
        print()
        print("Examples:")
        print('  open(path, encoding="utf-8")')
        print('  Path(file).read_text(encoding="utf-8")')
        print('  Path(file).write_text(content, encoding="utf-8")')
        print()
        return 1

    if args.verbose:
        print(f"✅ All {len(files)} files pass encoding checks")

    return 0
|
|
||
|
|
||
if __name__ == "__main__":
    # Propagate main()'s status code to the shell so pre-commit can block
    # the commit when encoding issues are found.
    raise SystemExit(main())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description mentions that the hook is "Scoped to
apps/backend/", but this configuration will run the hook on all Python files in the repository. To match the described behavior and avoid running the check on files outside the backend application, you should add afilespattern to the hook definition.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
✅ Already Correct - The hook is properly scoped to `apps/backend/` with the `files` directive on line 87. This ensures the validation only runs on backend Python files where we've applied the UTF-8 encoding fixes from PR #782. The frontend doesn't need these checks since it doesn't have the same file I/O patterns. Thank you for the review!