From d61918b6a5a771811663f7ececf207c1e280689e Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Wed, 16 Jul 2025 09:53:04 -0700 Subject: [PATCH 01/10] Cache extractions of unsuccessful runs; allow extraction persistence --- .../infoextractors/file_decompression.py | 148 ++++++++++++------ surfactant/infoextractors/ole_file.py | 4 +- surfactant/utils/exit_hook.py | 37 +++++ 3 files changed, 143 insertions(+), 46 deletions(-) create mode 100644 surfactant/utils/exit_hook.py diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 4695ebaa..55567498 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -9,10 +9,12 @@ import atexit import bz2 import gzip +import json import lzma import os import pathlib import shutil +import sys import tarfile import tempfile import zipfile @@ -26,16 +28,19 @@ from surfactant import ContextEntry from surfactant.configmanager import ConfigManager from surfactant.sbomtypes import SBOM, Software +from surfactant.utils import exit_hook -# Global list to track temp dirs -GLOBAL_TEMP_DIRS_LIST = [] +# Global list to track extracted dirs +# Hash -> Path to extracted directory & Result of array of 2-tuples (install_prefix, extract_path) +EXTRACT_DIRS = {} +EXTRACT_DIRS_PATH = pathlib.Path(tempfile.gettempdir()) / ".surfactant_extracted_dirs.json" -RAR_SUPPORT = {"enabled": True} +HAS_RAR_SUPPORT = False def supports_file(filetype: str) -> Optional[str]: if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or ( - filetype == "RAR" and RAR_SUPPORT["enabled"] + filetype == "RAR" and HAS_RAR_SUPPORT ): return filetype return None @@ -57,27 +62,36 @@ def extract_file_info( if compression_format: create_extraction( filename, + software, context_queue, current_context, lambda f, t: decompress_to(f, t, compression_format), ) -# decompress takes a filename and an output folder, and decompresses the file into that folder. 
-# Returning a boolean indicates an attempt (True) or refusal (False) to decompress. -# Returning a list of 2-tuples indicates that different ContextEntries should be created. The -# first element is the install prefix (or None if not applicable), and the second is the path -# to the extracted folder that should be under the install prefix. def create_extraction( filename: str, + software: Software, context_queue: "Queue[ContextEntry]", current_context: Optional[ContextEntry], decompress: Callable[[str, str], Union[bool, List[Tuple[str, str]]]], ): + """Create extraction context entries for decompressed archive files. + + Args: + filename (str): Path to the archive file to be extracted + software (Software): Software object associated with the file; used to skip extraction if already processed + context_queue (Queue[ContextEntry]): Queue to add new context entries for extracted content + current_context (Optional[ContextEntry]): Current context entry being processed + decompress (Callable[[str, str], Union[bool, List[Tuple[str, str]]]]): Function that performs + the actual decompression. 
Takes filename and output folder, returns True/False for success + or a list of tuples containing (install_prefix, extract_path) pairs for multiple entries + """ + install_prefix = "" # Check that archive key exists and filename is same as archive file - if current_context.archive and current_context.archive == filename: + if current_context and current_context.archive and current_context.archive == filename: if current_context.extractPaths is not None and current_context.extractPaths != []: logger.info( f"Already extracted, skipping extraction for archive: {current_context.archive}" @@ -87,19 +101,30 @@ def create_extraction( # Inherit the context entry install prefix for the extracted files install_prefix = current_context.installPrefix - # Create a temporary directory for extraction - temp_folder = create_temp_dir() - # Decompress the file - entries = decompress(filename, temp_folder) - - # Simple case where the decompressor doesn't need multiple entries - if entries is True: - entries = [(None, temp_folder)] - - # If False or an empty list - if not entries: - logger.error(f"Failed to decompress {filename}. No entries created.") - return + if software.sha256 in EXTRACT_DIRS and EXTRACT_DIRS[software.sha256]['result']: + entries = EXTRACT_DIRS[software.sha256]['result'] + logger.info( + f"Using cached extraction entries for {filename}" + ) + else: + # Create a temporary directory for extraction + temp_folder = create_temp_dir() + EXTRACT_DIRS[software.sha256] = {'path': temp_folder, 'result': None} + + # Decompress the file + entries = decompress(filename, temp_folder) + + # Simple case where the decompressor doesn't need multiple entries + if entries is True: + entries = [(None, temp_folder)] + + # If False or an empty list + if not entries: + logger.error(f"Failed to decompress {filename}. 
No entries created.") + return + + # Store the result in the global EXTRACT_DIRS + EXTRACT_DIRS[software.sha256]['result'] = entries for entry_prefix, extract_path in entries: # Merges our install prefix with the entry's install prefix (where applicable) @@ -150,15 +175,6 @@ def decompress_to(filename: str, output_folder: str, compression_format: str) -> return True -def create_temp_dir(): - # Create a temporary directory - temp_dir = tempfile.mkdtemp(prefix="surfactant-temp") - - # Add to global list of temp dirs to facilitate easier clean up at the end - GLOBAL_TEMP_DIRS_LIST.append(temp_dir) - return temp_dir - - def decompress_zip_file(filename: str, output_folder: str): try: with zipfile.ZipFile(filename, "r") as f: @@ -231,30 +247,74 @@ def decompress_rar_file(filename: str, output_folder: str): logger.info(f"Extracted RAR contents to {output_folder}") -def delete_temp_dirs(): - for temp_dir in GLOBAL_TEMP_DIRS_LIST: - if temp_dir and os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - logger.info(f"Cleaned up temporary directory: {temp_dir}") +def setup_extracted_dirs(): + """Get the list of directories where files have been extracted.""" + should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) + if should_cache_extractions and EXTRACT_DIRS_PATH.exists(): + try: + with open(EXTRACT_DIRS_PATH, "r") as f: + GLOBAL_EXTRACT_DIRS = json.load(f) + if not isinstance(GLOBAL_EXTRACT_DIRS, dict): + logger.error(f"Invalid format in {EXTRACT_DIRS_PATH}. 
Expected a dictionary.") + GLOBAL_EXTRACT_DIRS = {} + except json.JSONDecodeError: + logger.error(f"Failed to read extracted directories from {EXTRACT_DIRS_PATH}.") + GLOBAL_EXTRACT_DIRS = {} + else: + GLOBAL_EXTRACT_DIRS = {} + +def store_extracted_dirs(): + """Store the current extracted directories to a JSON file.""" + should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) + if should_cache_extractions: + try: + with open(EXTRACT_DIRS_PATH, "w") as f: + json.dump(EXTRACT_DIRS, f) + except IOError as e: + logger.error(f"Failed to write extracted directories to {EXTRACT_DIRS_PATH}: {e}") + + +def create_temp_dir(): + return tempfile.mkdtemp(prefix="surfactant-temp") + +def delete_temp_dirs(): + exited_gracefully = exit_hook.has_exited_gracefully() + should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) + should_persist_extractions = should_cache_extractions and ConfigManager().get("decompression", "persist_extractions", False) + keys = list(EXTRACT_DIRS.keys()) + for key in keys: + # Extraction was in progress or failed; we have no reason to keep it + extraction_failed = not EXTRACT_DIRS[key]['result'] + should_delete = extraction_failed or (not should_persist_extractions and exited_gracefully) + if not should_cache_extractions or should_delete: + temp_dir = EXTRACT_DIRS[key]['path'] + if temp_dir and os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + logger.info(f"Cleaned up temporary directory: {temp_dir}") + del EXTRACT_DIRS[key] @surfactant.plugin.hookimpl def init_hook(command_name: Optional[str] = None) -> None: - RAR_SUPPORT["enabled"] = False - + setup_extracted_dirs() + + global HAS_RAR_SUPPORT + HAS_RAR_SUPPORT = False + should_enable_rar = ConfigManager().get("rar", "enabled", True) if should_enable_rar: try: result = rarfile.tool_setup() if result.setup["open_cmd"][0] in ("UNRAR_TOOL", "UNAR_TOOL"): - RAR_SUPPORT["enabled"] = True + HAS_RAR_SUPPORT = True return except 
rarfile.RarCannotExec: pass logger.warning( "Install 'Unrar' or 'unar' tool for RAR archive decompression. RAR decompression disabled until installed." ) - - -# Register exit handler -atexit.register(delete_temp_dirs) + +@atexit.register +def cleanup_hook(): + delete_temp_dirs() + store_extracted_dirs() diff --git a/surfactant/infoextractors/ole_file.py b/surfactant/infoextractors/ole_file.py index 5a953ad3..0d8bc43e 100755 --- a/surfactant/infoextractors/ole_file.py +++ b/surfactant/infoextractors/ole_file.py @@ -91,13 +91,13 @@ def extract_file_info( if ole_info["ole"].get("clsid_type") == "MSI": file_decompression.create_extraction( - filename, context_queue, current_context, extract_msi + filename, software, context_queue, current_context, extract_msi ) return ole_info -def extract_ole_info(filename: str) -> object: +def extract_ole_info(filename: str) -> Dict[str, Any]: file_details: Dict[str, Any] = {} with olefile.OleFileIO(filename) as ole: diff --git a/surfactant/utils/exit_hook.py b/surfactant/utils/exit_hook.py new file mode 100644 index 00000000..bcaf1713 --- /dev/null +++ b/surfactant/utils/exit_hook.py @@ -0,0 +1,37 @@ + +import sys + +# https://stackoverflow.com/a/9741784 +class ExitHooks: + def __init__(self): + self.exit_code = None + self.exception = None + + def hook(self): + self._orig_exit = sys.exit + sys.exit = self.exit + sys.excepthook = self.exc_handler + + def exit(self, code=0): + self.exit_code = code + self._orig_exit(code) + + def exc_handler(self, exc_type, exc, *args): + self.exception = exc + +_HOOKS = ExitHooks() +_HOOKS.hook() + +def get_exit_code(): + return _HOOKS.exit_code + +def get_exception(): + return _HOOKS.exception + +def has_exited_gracefully(): + """ + Returns True if the program exited gracefully (without an exception). 
+ """ + if _HOOKS.exit_code is not None and _HOOKS.exit_code != 0: + return False + return _HOOKS.exception is None \ No newline at end of file From 9010f3f7602f7ee87c39d7a611b665cc0dbaa643 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Wed, 16 Jul 2025 09:55:33 -0700 Subject: [PATCH 02/10] Check if folder exists before skipping decompression --- surfactant/infoextractors/file_decompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 55567498..936e421c 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -101,7 +101,7 @@ def create_extraction( # Inherit the context entry install prefix for the extracted files install_prefix = current_context.installPrefix - if software.sha256 in EXTRACT_DIRS and EXTRACT_DIRS[software.sha256]['result']: + if software.sha256 in EXTRACT_DIRS and EXTRACT_DIRS[software.sha256]['result'] and os.path.exists(EXTRACT_DIRS[software.sha256]['path']): entries = EXTRACT_DIRS[software.sha256]['result'] logger.info( f"Using cached extraction entries for {filename}" From 670b90397abc4abc11c178905d21e3f1fe71ecde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Jul 2025 16:59:44 +0000 Subject: [PATCH 03/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../infoextractors/file_decompression.py | 44 ++++++++++--------- surfactant/utils/exit_hook.py | 10 +++-- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 936e421c..e87bb64e 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -14,7 +14,6 @@ import os import pathlib import shutil -import sys 
import tarfile import tempfile import zipfile @@ -39,9 +38,7 @@ def supports_file(filetype: str) -> Optional[str]: - if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or ( - filetype == "RAR" and HAS_RAR_SUPPORT - ): + if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or (filetype == "RAR" and HAS_RAR_SUPPORT): return filetype return None @@ -101,16 +98,18 @@ def create_extraction( # Inherit the context entry install prefix for the extracted files install_prefix = current_context.installPrefix - if software.sha256 in EXTRACT_DIRS and EXTRACT_DIRS[software.sha256]['result'] and os.path.exists(EXTRACT_DIRS[software.sha256]['path']): - entries = EXTRACT_DIRS[software.sha256]['result'] - logger.info( - f"Using cached extraction entries for {filename}" - ) + if ( + software.sha256 in EXTRACT_DIRS + and EXTRACT_DIRS[software.sha256]["result"] + and os.path.exists(EXTRACT_DIRS[software.sha256]["path"]) + ): + entries = EXTRACT_DIRS[software.sha256]["result"] + logger.info(f"Using cached extraction entries for {filename}") else: # Create a temporary directory for extraction temp_folder = create_temp_dir() - EXTRACT_DIRS[software.sha256] = {'path': temp_folder, 'result': None} - + EXTRACT_DIRS[software.sha256] = {"path": temp_folder, "result": None} + # Decompress the file entries = decompress(filename, temp_folder) @@ -122,9 +121,9 @@ def create_extraction( if not entries: logger.error(f"Failed to decompress {filename}. 
No entries created.") return - + # Store the result in the global EXTRACT_DIRS - EXTRACT_DIRS[software.sha256]['result'] = entries + EXTRACT_DIRS[software.sha256]["result"] = entries for entry_prefix, extract_path in entries: # Merges our install prefix with the entry's install prefix (where applicable) @@ -262,7 +261,8 @@ def setup_extracted_dirs(): GLOBAL_EXTRACT_DIRS = {} else: GLOBAL_EXTRACT_DIRS = {} - + + def store_extracted_dirs(): """Store the current extracted directories to a JSON file.""" should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) @@ -281,26 +281,29 @@ def create_temp_dir(): def delete_temp_dirs(): exited_gracefully = exit_hook.has_exited_gracefully() should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) - should_persist_extractions = should_cache_extractions and ConfigManager().get("decompression", "persist_extractions", False) + should_persist_extractions = should_cache_extractions and ConfigManager().get( + "decompression", "persist_extractions", False + ) keys = list(EXTRACT_DIRS.keys()) for key in keys: # Extraction was in progress or failed; we have no reason to keep it - extraction_failed = not EXTRACT_DIRS[key]['result'] + extraction_failed = not EXTRACT_DIRS[key]["result"] should_delete = extraction_failed or (not should_persist_extractions and exited_gracefully) if not should_cache_extractions or should_delete: - temp_dir = EXTRACT_DIRS[key]['path'] + temp_dir = EXTRACT_DIRS[key]["path"] if temp_dir and os.path.exists(temp_dir): shutil.rmtree(temp_dir) logger.info(f"Cleaned up temporary directory: {temp_dir}") del EXTRACT_DIRS[key] + @surfactant.plugin.hookimpl def init_hook(command_name: Optional[str] = None) -> None: setup_extracted_dirs() - + global HAS_RAR_SUPPORT HAS_RAR_SUPPORT = False - + should_enable_rar = ConfigManager().get("rar", "enabled", True) if should_enable_rar: try: @@ -313,7 +316,8 @@ def init_hook(command_name: Optional[str] = None) -> 
None: logger.warning( "Install 'Unrar' or 'unar' tool for RAR archive decompression. RAR decompression disabled until installed." ) - + + @atexit.register def cleanup_hook(): delete_temp_dirs() diff --git a/surfactant/utils/exit_hook.py b/surfactant/utils/exit_hook.py index bcaf1713..deb844d7 100644 --- a/surfactant/utils/exit_hook.py +++ b/surfactant/utils/exit_hook.py @@ -1,6 +1,6 @@ - import sys + # https://stackoverflow.com/a/9741784 class ExitHooks: def __init__(self): @@ -18,20 +18,24 @@ def exit(self, code=0): def exc_handler(self, exc_type, exc, *args): self.exception = exc - + + _HOOKS = ExitHooks() _HOOKS.hook() + def get_exit_code(): return _HOOKS.exit_code + def get_exception(): return _HOOKS.exception + def has_exited_gracefully(): """ Returns True if the program exited gracefully (without an exception). """ if _HOOKS.exit_code is not None and _HOOKS.exit_code != 0: return False - return _HOOKS.exception is None \ No newline at end of file + return _HOOKS.exception is None From 751ee27540080682d114b010c30b42838c52fbd3 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Wed, 16 Jul 2025 10:06:17 -0700 Subject: [PATCH 04/10] Fix pylint --- .../infoextractors/file_decompression.py | 26 +++++++++++-------- surfactant/utils/exit_hook.py | 24 +++++++++++------ 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index e87bb64e..c5da118d 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -34,11 +34,13 @@ EXTRACT_DIRS = {} EXTRACT_DIRS_PATH = pathlib.Path(tempfile.gettempdir()) / ".surfactant_extracted_dirs.json" -HAS_RAR_SUPPORT = False +RAR_SUPPORT = {"enabled": False} def supports_file(filetype: str) -> Optional[str]: - if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or (filetype == "RAR" and HAS_RAR_SUPPORT): + if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or ( + 
filetype == "RAR" and RAR_SUPPORT["enabled"] + ): return filetype return None @@ -296,20 +298,15 @@ def delete_temp_dirs(): logger.info(f"Cleaned up temporary directory: {temp_dir}") del EXTRACT_DIRS[key] - -@surfactant.plugin.hookimpl -def init_hook(command_name: Optional[str] = None) -> None: - setup_extracted_dirs() - - global HAS_RAR_SUPPORT - HAS_RAR_SUPPORT = False - +def setup_rar_support(): + RAR_SUPPORT["enabled"] = False + should_enable_rar = ConfigManager().get("rar", "enabled", True) if should_enable_rar: try: result = rarfile.tool_setup() if result.setup["open_cmd"][0] in ("UNRAR_TOOL", "UNAR_TOOL"): - HAS_RAR_SUPPORT = True + RAR_SUPPORT["enabled"] = True return except rarfile.RarCannotExec: pass @@ -318,7 +315,14 @@ def init_hook(command_name: Optional[str] = None) -> None: ) +@surfactant.plugin.hookimpl +def init_hook(command_name: Optional[str] = None): + """Initialize the file decompression plugin.""" + setup_extracted_dirs() + setup_rar_support() + @atexit.register def cleanup_hook(): + """Clean up temporary directories and store extraction cache on exit.""" delete_temp_dirs() store_extracted_dirs() diff --git a/surfactant/utils/exit_hook.py b/surfactant/utils/exit_hook.py index deb844d7..2897687f 100644 --- a/surfactant/utils/exit_hook.py +++ b/surfactant/utils/exit_hook.py @@ -1,22 +1,28 @@ import sys +from typing import Callable, Any, Optional # https://stackoverflow.com/a/9741784 class ExitHooks: def __init__(self): - self.exit_code = None - self.exception = None + self.exit_code: Optional[int] = None + self.exception: Optional[Exception] = None + self._orig_exit: Optional[Callable[[int], None]] = None def hook(self): + """Install the exit and exception hooks.""" self._orig_exit = sys.exit sys.exit = self.exit sys.excepthook = self.exc_handler - def exit(self, code=0): + def exit(self, code: int = 0): + """Custom exit handler that captures the exit code.""" self.exit_code = code - self._orig_exit(code) + if self._orig_exit is not None: + 
self._orig_exit(code) - def exc_handler(self, exc_type, exc, *args): + def exc_handler(self, exc_type: type, exc: Exception, *args: Any): + """Custom exception handler that captures exceptions.""" self.exception = exc @@ -24,15 +30,17 @@ def exc_handler(self, exc_type, exc, *args): _HOOKS.hook() -def get_exit_code(): +def get_exit_code() -> Optional[int]: + """Get the exit code from the last program exit.""" return _HOOKS.exit_code -def get_exception(): +def get_exception() -> Optional[Exception]: + """Get the exception from the last unhandled exception.""" return _HOOKS.exception -def has_exited_gracefully(): +def has_exited_gracefully() -> bool: """ Returns True if the program exited gracefully (without an exception). """ From cad2be463de568406247217ecdb67f7586fe138f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:11:22 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/infoextractors/file_decompression.py | 6 ++++-- surfactant/utils/exit_hook.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index c5da118d..49575a2f 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -298,9 +298,10 @@ def delete_temp_dirs(): logger.info(f"Cleaned up temporary directory: {temp_dir}") del EXTRACT_DIRS[key] + def setup_rar_support(): RAR_SUPPORT["enabled"] = False - + should_enable_rar = ConfigManager().get("rar", "enabled", True) if should_enable_rar: try: @@ -320,7 +321,8 @@ def init_hook(command_name: Optional[str] = None): """Initialize the file decompression plugin.""" setup_extracted_dirs() setup_rar_support() - + + @atexit.register def cleanup_hook(): """Clean up temporary directories and store extraction cache on 
exit.""" diff --git a/surfactant/utils/exit_hook.py b/surfactant/utils/exit_hook.py index 2897687f..015d7f1e 100644 --- a/surfactant/utils/exit_hook.py +++ b/surfactant/utils/exit_hook.py @@ -1,5 +1,5 @@ import sys -from typing import Callable, Any, Optional +from typing import Any, Callable, Optional # https://stackoverflow.com/a/9741784 From b9cfc2f5c63fc96a118cc37f7916928ecb9cc17f Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Wed, 30 Jul 2025 13:51:02 -0700 Subject: [PATCH 06/10] Add configuration of extract folder --- surfactant/infoextractors/file_decompression.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 49575a2f..8cc22ab1 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -29,10 +29,14 @@ from surfactant.sbomtypes import SBOM, Software from surfactant.utils import exit_hook +EXTRACT_DIR = pathlib.Path(ConfigManager().get("decompression", "extract-dir", tempfile.gettempdir())) +EXTRACT_DIRS_PREFIX = ConfigManager().get("decompression", "extract-prefix", "surfactant-temp") +EXTRACT_DIR.mkdir(parents=True, exist_ok=True) + # Global list to track extracted dirs # Hash -> Path to extracted directory & Result of array of 2-tuples (install_prefix, extract_path) EXTRACT_DIRS = {} -EXTRACT_DIRS_PATH = pathlib.Path(tempfile.gettempdir()) / ".surfactant_extracted_dirs.json" +EXTRACT_DIRS_PATH = EXTRACT_DIR / ".surfactant_extracted_dirs.json" RAR_SUPPORT = {"enabled": False} @@ -109,7 +113,7 @@ def create_extraction( logger.info(f"Using cached extraction entries for {filename}") else: # Create a temporary directory for extraction - temp_folder = create_temp_dir() + temp_folder = create_extract_dir() EXTRACT_DIRS[software.sha256] = {"path": temp_folder, "result": None} # Decompress the file @@ -276,11 +280,11 @@ def store_extracted_dirs(): 
logger.error(f"Failed to write extracted directories to {EXTRACT_DIRS_PATH}: {e}") -def create_temp_dir(): - return tempfile.mkdtemp(prefix="surfactant-temp") +def create_extract_dir(): + return tempfile.mkdtemp(prefix=EXTRACT_DIRS_PREFIX, dir=EXTRACT_DIR) -def delete_temp_dirs(): +def delete_extract_dirs(): exited_gracefully = exit_hook.has_exited_gracefully() should_cache_extractions = ConfigManager().get("decompression", "cache_extractions", True) should_persist_extractions = should_cache_extractions and ConfigManager().get( @@ -326,5 +330,5 @@ def init_hook(command_name: Optional[str] = None): @atexit.register def cleanup_hook(): """Clean up temporary directories and store extraction cache on exit.""" - delete_temp_dirs() + delete_extract_dirs() store_extracted_dirs() From 9013f5b72baabe1931ca94442125dd485920c75e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 20:58:12 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/infoextractors/file_decompression.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index dd860ec8..aad27de0 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -29,7 +29,9 @@ from surfactant.sbomtypes import SBOM, Software from surfactant.utils import exit_hook -EXTRACT_DIR = pathlib.Path(ConfigManager().get("decompression", "extract-dir", tempfile.gettempdir())) +EXTRACT_DIR = pathlib.Path( + ConfigManager().get("decompression", "extract-dir", tempfile.gettempdir()) +) EXTRACT_DIRS_PREFIX = ConfigManager().get("decompression", "extract-prefix", "surfactant-temp") EXTRACT_DIR.mkdir(parents=True, exist_ok=True) From efb69eb96990b9cce24e9dbad58ed7306bd00991 Mon Sep 17 00:00:00 2001 From: 
Asriel Margarian Date: Wed, 30 Jul 2025 14:01:46 -0700 Subject: [PATCH 08/10] Fix pylint --- surfactant/infoextractors/file_decompression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index aad27de0..e94e5999 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -80,7 +80,7 @@ def extract_file_info( software, context_queue, current_context, - lambda f, t: decompress_to(f, t, fmt), + lambda f, t, format=fmt: decompress_to(f, t, format), ) From cf36eba0321366581e1cc209b50df32e6521748d Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Wed, 30 Jul 2025 15:00:09 -0700 Subject: [PATCH 09/10] Add decompression options docs --- docs/settings.md | 11 +++++++++++ surfactant/infoextractors/file_decompression.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/settings.md b/docs/settings.md index 39c3a9a4..989fe6a2 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -46,3 +46,14 @@ Surfactant supports overriding database source URLs via the sources section in t ```bash surfactant config sources.js_library_patterns.retirejs https://new-url.com ``` + +## decompression + +- extract_dir + - Directory where files are extracted when decompressing files. Default is the system's temporary directory. +- extract_prefix + - Prefix for extract directories created when decompressing files. Default is `surfactant-temp`. +- cache_extractions + - Controls whether to cache decompressed/extracted files. If Surfactant fails to exit successfully, the cache will be used to prevent re-extractions. If set to `true`, decompressed files will be cached in the `extract_dir` directory. Default is `true`. +- persist_extractions + - Controls whether to persist decompressed files after Surfactant exits successfully. 
If set to `true`, decompressed files will be kept in the `extract_dir` directory. Default is `false`. \ No newline at end of file diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index e94e5999..2ab13022 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -30,9 +30,9 @@ from surfactant.utils import exit_hook EXTRACT_DIR = pathlib.Path( - ConfigManager().get("decompression", "extract-dir", tempfile.gettempdir()) + ConfigManager().get("decompression", "extract_dir", tempfile.gettempdir()) ) -EXTRACT_DIRS_PREFIX = ConfigManager().get("decompression", "extract-prefix", "surfactant-temp") +EXTRACT_DIRS_PREFIX = ConfigManager().get("decompression", "extract_prefix", "surfactant-temp") EXTRACT_DIR.mkdir(parents=True, exist_ok=True) # Global list to track extracted dirs From d784a436b3877ff2bbae94de275e206d65dc0959 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 22:00:38 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/settings.md b/docs/settings.md index 989fe6a2..be8f8e09 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -56,4 +56,4 @@ Surfactant supports overriding database source URLs via the sources section in t - cache_extractions - Controls whether to cache decompressed/extracted files. If Surfactant fails to exit successfully, the cache will be used to prevent re-extractions. If set to `true`, decompressed files will be cached in the `extract_dir` directory. Default is `true`. - persist_extractions - - Controls whether to persist decompressed files after Surfactant exits successfully. If set to `true`, decompressed files will be kept in the `extract_dir` directory. 
Default is `false`. \ No newline at end of file + - Controls whether to persist decompressed files after Surfactant exits successfully. If set to `true`, decompressed files will be kept in the `extract_dir` directory. Default is `false`.