From af0cd933e84b9f83210c0f12f95a456606ee79e9 Mon Sep 17 00:00:00 2001 From: David Lakin Date: Thu, 6 Jun 2024 02:17:25 -0400 Subject: [PATCH 1/6] Fix "OSError: [Errno 36] File name too long" in fuzz_submodule Fixes a bug in the `fuzz_submodule` harness where the fuzzed data can produce file names that exceed the maximum size allowed byt the OS. This issue came up previously and was fixed in #1922, but the submodule file name fixed here was missed in that PR. Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=69456 --- fuzzing/fuzz-targets/fuzz_submodule.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fuzzing/fuzz-targets/fuzz_submodule.py b/fuzzing/fuzz-targets/fuzz_submodule.py index ca47690ea..9f5828d8d 100644 --- a/fuzzing/fuzz-targets/fuzz_submodule.py +++ b/fuzzing/fuzz-targets/fuzz_submodule.py @@ -35,12 +35,13 @@ def TestOneInput(data): sub_repo = Repo.init(submodule_temp_dir, bare=fdp.ConsumeBool()) sub_repo.index.commit(fdp.ConsumeUnicodeNoSurrogates(fdp.ConsumeIntInRange(1, 512))) - submodule_name = f"submodule_{fdp.ConsumeUnicodeNoSurrogates(fdp.ConsumeIntInRange(1, 512))}" + submodule_name = fdp.ConsumeUnicodeNoSurrogates( + fdp.ConsumeIntInRange(1, max(1, get_max_filename_length(repo.working_tree_dir))) + ) submodule_path = os.path.join(repo.working_tree_dir, submodule_name) - submodule_url = sub_repo.git_dir - submodule = repo.create_submodule(submodule_name, submodule_path, url=submodule_url) - repo.index.commit(f"Added submodule {submodule_name}") + submodule = repo.create_submodule(submodule_name, submodule_path, url=sub_repo.git_dir) + repo.index.commit("Added submodule") with submodule.config_writer() as writer: key_length = fdp.ConsumeIntInRange(1, max(1, fdp.remaining_bytes())) From 7de1556d3895c718f0f0772530ff7cde5457d9d8 Mon Sep 17 00:00:00 2001 From: David Lakin Date: Thu, 8 Aug 2024 16:54:37 -0400 Subject: [PATCH 2/6] Filter out non-bug exceptions using a pre-defined exception list. This reduces false positive test failures by identifying and gracefully handling exceptions that are explicitly raised by GitPython, thus reducing the false-positive fuzzing test failure rate. --- fuzzing/fuzz-targets/fuzz_submodule.py | 56 +++++++++++++++---- fuzzing/oss-fuzz-scripts/build.sh | 2 +- .../container-environment-bootstrap.sh | 11 ++++ 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/fuzzing/fuzz-targets/fuzz_submodule.py b/fuzzing/fuzz-targets/fuzz_submodule.py index 9f5828d8d..05c543bf8 100644 --- a/fuzzing/fuzz-targets/fuzz_submodule.py +++ b/fuzzing/fuzz-targets/fuzz_submodule.py @@ -1,16 +1,51 @@ +# ruff: noqa: E402 import atheris import sys import os +import traceback import tempfile from configparser import ParsingError -from utils import is_expected_exception_message, get_max_filename_length +from utils import get_max_filename_length +import re + +bundle_dir = os.path.dirname(os.path.abspath(__file__)) if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): # pragma: no cover - path_to_bundled_git_binary = os.path.abspath(os.path.join(os.path.dirname(__file__), "git")) - os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = path_to_bundled_git_binary + bundled_git_binary_path = os.path.join(bundle_dir, "git") + os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = bundled_git_binary_path from git import Repo, GitCommandError, InvalidGitRepositoryError + +def load_exception_list(file_path): + """Load and parse the exception list from a file.""" + try: + with open(file_path, "r") as file: + lines = file.readlines() + exception_list = set() + for line in lines: + match = re.match(r"(.+):(\d+):", line) + if match: + file_path = match.group(1).strip() + line_number = int(match.group(2).strip()) + exception_list.add((file_path, line_number)) + return exception_list + except FileNotFoundError: + print("File not found: %s", file_path) + return set() + except Exception as e: + print("Error loading exception list: %s", e) + return set() + + +def check_exception_against_list(exception_list, exc_traceback): + """Check if the exception traceback matches any entry in the exception list.""" + for filename, lineno, _, _ in traceback.extract_tb(exc_traceback): + if (filename, lineno) in exception_list: + return True + return False + + if not sys.warnoptions: # pragma: no cover # The warnings filter below can be overridden by passing the -W option # to the Python interpreter command line or setting the `PYTHONWARNINGS` environment variable. @@ -89,17 +124,14 @@ def TestOneInput(data): BrokenPipeError, ): return -1 - except ValueError as e: - expected_messages = [ - "SHA is empty", - "Reference at", - "embedded null byte", - "This submodule instance does not exist anymore", - "cmd stdin was empty", - ] - if is_expected_exception_message(e, expected_messages): + except Exception as e: + exc_traceback = e.__traceback__ + exception_list = load_exception_list(os.path.join(bundle_dir, "explicit-exceptions-list.txt")) + if check_exception_against_list(exception_list, exc_traceback): + print("Exception matches an entry in the exception list.") return -1 else: + print("Exception does not match any entry in the exception list.") raise e diff --git a/fuzzing/oss-fuzz-scripts/build.sh b/fuzzing/oss-fuzz-scripts/build.sh index e0b3a50ab..c156e872d 100644 --- a/fuzzing/oss-fuzz-scripts/build.sh +++ b/fuzzing/oss-fuzz-scripts/build.sh @@ -15,5 +15,5 @@ find "$SRC" -maxdepth 1 \ # Build fuzzers in $OUT. find "$SRC/gitpython/fuzzing" -name 'fuzz_*.py' -print0 | while IFS= read -r -d '' fuzz_harness; do - compile_python_fuzzer "$fuzz_harness" --add-binary="$(command -v git):." + compile_python_fuzzer "$fuzz_harness" --add-binary="$(command -v git):." --add-data="$SRC/explicit-exceptions-list.txt:." done diff --git a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh index bbdcf5357..af1ddf014 100755 --- a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh +++ b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh @@ -91,6 +91,17 @@ create_seed_corpora_zips "$WORK/qa-assets/gitpython/corpora" prepare_dictionaries_for_fuzz_targets "$WORK/qa-assets/gitpython/dictionaries" "$SRC/gitpython/fuzzing" +pushd "$SRC/gitpython/" +# Search for 'raise' and 'assert' statements in Python files within GitPython's 'git/' directory and its submodules, +# remove trailing colons, and save to 'explicit-exceptions-list.txt'. This file can then be used by fuzz harnesses to +# check exception tracebacks: +# If an exception found by the fuzzer originated in a file + line number in explicit-exceptions-list.txt, then it is not a bug. + +git grep -n --recurse-submodules -e '\braise\b' -e '\bassert\b' -- "git/**/*.py" > "$SRC/explicit-exceptions-list.txt" + +popd + + # The OSS-Fuzz base image has outdated dependencies by default so we upgrade them below. python3 -m pip install --upgrade pip # Upgrade to the latest versions known to work at the time the below changes were introduced: From 799b9cae745f50f2c0c590e8b3e19bfea199c463 Mon Sep 17 00:00:00 2001 From: David Lakin Date: Thu, 8 Aug 2024 18:58:28 -0400 Subject: [PATCH 3/6] Improve `check_exception_against_list` matching logic using regex Changes: - `match_exception_with_traceback` uses regular expressions for more flexible matching of file paths and line numbers. This allows for partial matches and more complex patterns. - Improve `check_exception_against_list` by delegating to `match_exception_with_traceback` for checking tracebacks against exception list entries. - `load_exception_list`: Remains largely unchanged, as it correctly parses the file and line number from each exception entry. However, we ensure the set consists of regex patterns to match against tracebacks. --- fuzzing/fuzz-targets/fuzz_submodule.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fuzzing/fuzz-targets/fuzz_submodule.py b/fuzzing/fuzz-targets/fuzz_submodule.py index 05c543bf8..37f069079 100644 --- a/fuzzing/fuzz-targets/fuzz_submodule.py +++ b/fuzzing/fuzz-targets/fuzz_submodule.py @@ -31,21 +31,27 @@ def load_exception_list(file_path): exception_list.add((file_path, line_number)) return exception_list except FileNotFoundError: - print("File not found: %s", file_path) + print(f"File not found: {file_path}") return set() except Exception as e: - print("Error loading exception list: %s", e) + print(f"Error loading exception list: {e}") return set() -def check_exception_against_list(exception_list, exc_traceback): - """Check if the exception traceback matches any entry in the exception list.""" +def match_exception_with_traceback(exception_list, exc_traceback): + """Match exception traceback with the entries in the exception list.""" for filename, lineno, _, _ in traceback.extract_tb(exc_traceback): - if (filename, lineno) in exception_list: - return True + for file_pattern, line_pattern in exception_list: + if re.fullmatch(file_pattern, filename) and re.fullmatch(line_pattern, str(lineno)): + return True return False +def check_exception_against_list(exception_list, exc_traceback): + """Check if the exception traceback matches any entry in the exception list.""" + return match_exception_with_traceback(exception_list, exc_traceback) + + if not sys.warnoptions: # pragma: no cover # The warnings filter below can be overridden by passing the -W option # to the Python interpreter command line or setting the `PYTHONWARNINGS` environment variable. @@ -128,10 +134,8 @@ def TestOneInput(data): exc_traceback = e.__traceback__ exception_list = load_exception_list(os.path.join(bundle_dir, "explicit-exceptions-list.txt")) if check_exception_against_list(exception_list, exc_traceback): - print("Exception matches an entry in the exception list.") return -1 else: - print("Exception does not match any entry in the exception list.") raise e From 2e9c23995b70372a18edc4d0b143b6b522d3fb39 Mon Sep 17 00:00:00 2001 From: David Lakin Date: Thu, 8 Aug 2024 19:38:06 -0400 Subject: [PATCH 4/6] Extract environment setup and exception checking boilerplate logic Changes: - Simplify exception handling in test harnesses via `handle_exception(e)` in the `except Exception as e:` block. - `setup_git_environment` is a step towards centralizing environment variable and logging configuration set up consistently across different fuzzing scripts. **Only applying it to a single test for now is an intentional choice in case it fails to work in the ClusterFuzz environment!** If it proves successful, a follow-up change set will be welcome. --- fuzzing/fuzz-targets/fuzz_submodule.py | 70 +++------------------ fuzzing/fuzz-targets/utils.py | 87 +++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 62 deletions(-) diff --git a/fuzzing/fuzz-targets/fuzz_submodule.py b/fuzzing/fuzz-targets/fuzz_submodule.py index 37f069079..634572bf2 100644 --- a/fuzzing/fuzz-targets/fuzz_submodule.py +++ b/fuzzing/fuzz-targets/fuzz_submodule.py @@ -1,67 +1,17 @@ -# ruff: noqa: E402 import atheris import sys import os -import traceback import tempfile from configparser import ParsingError -from utils import get_max_filename_length -import re - -bundle_dir = os.path.dirname(os.path.abspath(__file__)) - -if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): # pragma: no cover - bundled_git_binary_path = os.path.join(bundle_dir, "git") - os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = bundled_git_binary_path - from git import Repo, GitCommandError, InvalidGitRepositoryError +from utils import ( + setup_git_environment, + handle_exception, + get_max_filename_length, +) - -def load_exception_list(file_path): - """Load and parse the exception list from a file.""" - try: - with open(file_path, "r") as file: - lines = file.readlines() - exception_list = set() - for line in lines: - match = re.match(r"(.+):(\d+):", line) - if match: - file_path = match.group(1).strip() - line_number = int(match.group(2).strip()) - exception_list.add((file_path, line_number)) - return exception_list - except FileNotFoundError: - print(f"File not found: {file_path}") - return set() - except Exception as e: - print(f"Error loading exception list: {e}") - return set() - - -def match_exception_with_traceback(exception_list, exc_traceback): - """Match exception traceback with the entries in the exception list.""" - for filename, lineno, _, _ in traceback.extract_tb(exc_traceback): - for file_pattern, line_pattern in exception_list: - if re.fullmatch(file_pattern, filename) and re.fullmatch(line_pattern, str(lineno)): - return True - return False - - -def check_exception_against_list(exception_list, exc_traceback): - """Check if the exception traceback matches any entry in the exception list.""" - return match_exception_with_traceback(exception_list, exc_traceback) - - -if not sys.warnoptions: # pragma: no cover - # The warnings filter below can be overridden by passing the -W option - # to the Python interpreter command line or setting the `PYTHONWARNINGS` environment variable. - import warnings - import logging - - # Fuzzing data causes some modules to generate a large number of warnings - # which are not usually interesting and make the test output hard to read, so we ignore them. - warnings.simplefilter("ignore") - logging.getLogger().setLevel(logging.ERROR) +# Setup the git environment +setup_git_environment() def TestOneInput(data): @@ -131,12 +81,10 @@ def TestOneInput(data): ): return -1 except Exception as e: - exc_traceback = e.__traceback__ - exception_list = load_exception_list(os.path.join(bundle_dir, "explicit-exceptions-list.txt")) - if check_exception_against_list(exception_list, exc_traceback): + if isinstance(e, ValueError) and "embedded null byte" in str(e): return -1 else: - raise e + return handle_exception(e) def main(): diff --git a/fuzzing/fuzz-targets/utils.py b/fuzzing/fuzz-targets/utils.py index f522d2959..97e6eab98 100644 --- a/fuzzing/fuzz-targets/utils.py +++ b/fuzzing/fuzz-targets/utils.py @@ -1,6 +1,9 @@ import atheris # pragma: no cover import os # pragma: no cover -from typing import List # pragma: no cover +import re # pragma: no cover +import traceback # pragma: no cover +import sys # pragma: no cover +from typing import Set, Tuple, List # pragma: no cover @atheris.instrument_func @@ -35,3 +38,85 @@ def get_max_filename_length(path: str) -> int: # pragma: no cover int: The maximum filename length. """ return os.pathconf(path, "PC_NAME_MAX") + + +@atheris.instrument_func +def read_lines_from_file(file_path: str) -> list: + """Read lines from a file and return them as a list.""" + try: + with open(file_path, "r") as f: + return [line.strip() for line in f if line.strip()] + except FileNotFoundError: + print(f"File not found: {file_path}") + return [] + except IOError as e: + print(f"Error reading file {file_path}: {e}") + return [] + + +@atheris.instrument_func +def load_exception_list(file_path: str = "explicit-exceptions-list.txt") -> Set[Tuple[str, str]]: + """Load and parse the exception list from a default or specified file.""" + try: + bundle_dir = os.path.dirname(os.path.abspath(__file__)) + full_path = os.path.join(bundle_dir, file_path) + lines = read_lines_from_file(full_path) + exception_list: Set[Tuple[str, str]] = set() + for line in lines: + match = re.match(r"(.+):(\d+):", line) + if match: + file_path: str = match.group(1).strip() + line_number: str = str(match.group(2).strip()) + exception_list.add((file_path, line_number)) + return exception_list + except Exception as e: + print(f"Error loading exception list: {e}") + return set() + + +@atheris.instrument_func +def match_exception_with_traceback(exception_list: Set[Tuple[str, str]], exc_traceback) -> bool: + """Match exception traceback with the entries in the exception list.""" + for filename, lineno, _, _ in traceback.extract_tb(exc_traceback): + for file_pattern, line_pattern in exception_list: + # Ensure filename and line_number are strings for regex matching + if re.fullmatch(file_pattern, filename) and re.fullmatch(line_pattern, str(lineno)): + return True + return False + + +@atheris.instrument_func +def check_exception_against_list(exc_traceback, exception_file: str = "explicit-exceptions-list.txt") -> bool: + """Check if the exception traceback matches any entry in the exception list.""" + exception_list = load_exception_list(exception_file) + return match_exception_with_traceback(exception_list, exc_traceback) + + +@atheris.instrument_func +def handle_exception(e: Exception) -> int: + """Encapsulate exception handling logic for reusability.""" + exc_traceback = e.__traceback__ + if check_exception_against_list(exc_traceback): + return -1 + else: + raise e + + +@atheris.instrument_func +def setup_git_environment() -> None: + """Set up the environment variables for Git.""" + bundle_dir = os.path.dirname(os.path.abspath(__file__)) + if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"): # pragma: no cover + bundled_git_binary_path = os.path.join(bundle_dir, "git") + os.environ["GIT_PYTHON_GIT_EXECUTABLE"] = bundled_git_binary_path + + if not sys.warnoptions: # pragma: no cover + # The warnings filter below can be overridden by passing the -W option + # to the Python interpreter command line or setting the `PYTHONWARNINGS` environment variable. + import warnings + import logging + + # Fuzzing data causes some modules to generate a large number of warnings + # which are not usually interesting and make the test output hard to read, so we ignore them. + warnings.simplefilter("ignore") + logging.getLogger().setLevel(logging.ERROR) From 27de8676c64b549038b4fdd994a20f1ce996ad5e Mon Sep 17 00:00:00 2001 From: David Lakin Date: Thu, 8 Aug 2024 20:35:13 -0400 Subject: [PATCH 5/6] Fix buggy `git grep` pathspec args To ensure that all necessary files are included in the explicit-exceptions-list.txt file and unwanted files and directories are not. --- .../container-environment-bootstrap.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh index af1ddf014..924a3cbf3 100755 --- a/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh +++ b/fuzzing/oss-fuzz-scripts/container-environment-bootstrap.sh @@ -92,12 +92,12 @@ create_seed_corpora_zips "$WORK/qa-assets/gitpython/corpora" prepare_dictionaries_for_fuzz_targets "$WORK/qa-assets/gitpython/dictionaries" "$SRC/gitpython/fuzzing" pushd "$SRC/gitpython/" -# Search for 'raise' and 'assert' statements in Python files within GitPython's 'git/' directory and its submodules, -# remove trailing colons, and save to 'explicit-exceptions-list.txt'. This file can then be used by fuzz harnesses to -# check exception tracebacks: -# If an exception found by the fuzzer originated in a file + line number in explicit-exceptions-list.txt, then it is not a bug. +# Search for 'raise' and 'assert' statements in Python files within GitPython's source code and submodules, saving the +# matched file path, line number, and line content to a file named 'explicit-exceptions-list.txt'. +# This file can then be used by fuzz harnesses to check exception tracebacks and filter out explicitly raised or otherwise +# anticipated exceptions to reduce false positive test failures. -git grep -n --recurse-submodules -e '\braise\b' -e '\bassert\b' -- "git/**/*.py" > "$SRC/explicit-exceptions-list.txt" +git grep -n --recurse-submodules -e '\braise\b' -e '\bassert\b' -- '*.py' -- ':!setup.py' -- ':!test/**' -- ':!fuzzing/**' > "$SRC/explicit-exceptions-list.txt" popd From 2ed33345667706c5755708e88c989ede06f2414f Mon Sep 17 00:00:00 2001 From: David Lakin Date: Fri, 9 Aug 2024 00:06:44 -0400 Subject: [PATCH 6/6] Fix order of environment setup and git module import The environment setup must happen before the `git` module is imported, otherwise GitPython won't be able to find the Git executable and raise an exception that causes the ClusterFuzz fuzzer runs to fail. --- fuzzing/fuzz-targets/fuzz_submodule.py | 2 +- pyproject.toml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fuzzing/fuzz-targets/fuzz_submodule.py b/fuzzing/fuzz-targets/fuzz_submodule.py index 634572bf2..997133b70 100644 --- a/fuzzing/fuzz-targets/fuzz_submodule.py +++ b/fuzzing/fuzz-targets/fuzz_submodule.py @@ -3,7 +3,6 @@ import os import tempfile from configparser import ParsingError -from git import Repo, GitCommandError, InvalidGitRepositoryError from utils import ( setup_git_environment, handle_exception, @@ -12,6 +11,7 @@ # Setup the git environment setup_git_environment() +from git import Repo, GitCommandError, InvalidGitRepositoryError def TestOneInput(data): diff --git a/pyproject.toml b/pyproject.toml index 603e2597c..6cf4b3f5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,10 @@ lint.unfixable = [ "test/**" = [ "B018", # useless-expression ] +"fuzzing/fuzz-targets/**" = [ + "E402", # environment setup must happen before the `git` module is imported, thus cannot happen at top of file +] + [tool.codespell] ignore-words-list="gud,doesnt"