From 324a92697331949c9315dc078fa268e6ea9c1fcf Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 15:20:06 -0700 Subject: [PATCH 01/12] Add extractcode as stopgap for decompression/extraction --- pyproject.toml | 7 +- surfactant/filetypeid/id_extractcode.py | 35 ++++++ surfactant/filetypeid/id_magic.py | 2 +- surfactant/infoextractors/extractcode_file.py | 102 ++++++++++++++++++ .../infoextractors/file_decompression.py | 44 ++------ surfactant/plugin/manager.py | 5 +- 6 files changed, 152 insertions(+), 43 deletions(-) create mode 100644 surfactant/filetypeid/id_extractcode.py create mode 100644 surfactant/infoextractors/extractcode_file.py diff --git a/pyproject.toml b/pyproject.toml index 426293b55..dcdec57e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,17 +56,18 @@ dependencies = [ "tomlkit==0.13.*", "textual==3.*", "requests>=2.32.3", - "rarfile==4.2.*", # Pinned to specific version for potential breaking changes "networkx>=2.6", - "python-msi==0.0.0a2" - ] + "python-msi==0.0.0a2", + "extractcode>=31.0.0", +] dynamic = ["version"] [project.optional-dependencies] macho = ["lief==0.16.6"] java = ["javatools>=1.6,==1.*"] +extract-full = ["extractcode[full]"] test = ["pytest"] dev = ["build", "pre-commit"] docs = ["sphinx", "myst-parser"] diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py new file mode 100644 index 000000000..590cdc17a --- /dev/null +++ b/surfactant/filetypeid/id_extractcode.py @@ -0,0 +1,35 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT +from typing import Optional +from extractcode import archive as ec_archive +from extractcode import sevenzip + +import surfactant.plugin + + +@surfactant.plugin.hookimpl +def identify_file_type(filepath: str) -> Optional[str]: + try: + ec_handler = ec_archive.get_best_handler(filepath) + if ec_handler: + return f"EXTRACTCODE-{ec_handler.name}" + return None + except FileNotFoundError: + return None + +@surfactant.plugin.hookimpl +def init_hook(command_name: Optional[str] = None) -> None: + ec_archive.archive_handlers.append(WimHandler) + +# Add WIM support to extractcode via 7zip +WimHandler = ec_archive.Handler( + name='Microsoft wim', + filetypes=('Windows imaging (WIM) image'), + mimetypes=('application/x-ms-wim',), + extensions=('.wim',), + kind=ec_archive.package, + extractors=[sevenzip.extract], + strict=True +) \ No newline at end of file diff --git a/surfactant/filetypeid/id_magic.py b/surfactant/filetypeid/id_magic.py index f8eb896e8..85b21ae8b 100755 --- a/surfactant/filetypeid/id_magic.py +++ b/surfactant/filetypeid/id_magic.py @@ -252,6 +252,6 @@ def identify_file_type(filepath: str) -> Optional[str]: if magic_bytes[:4] == b"\xed\xab\xee\xdb": return "RPM Package" - return None + return None except FileNotFoundError: return None diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py new file mode 100644 index 000000000..6d9ffcedc --- /dev/null +++ b/surfactant/infoextractors/extractcode_file.py @@ -0,0 +1,102 @@ +# Copyright 2025 Lawrence Livermore National Security, LLC +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT +# Copyright 2025 Lawrence Livermore National Security, LLC +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT +from queue import Queue +from typing import Any, Dict, Optional +from extractcode import archive as ec_archive + +from loguru import logger + +from surfactant.infoextractors.file_decompression import create_extraction +import surfactant.plugin +from surfactant import ContextEntry +from surfactant.sbomtypes import SBOM, Software + +ADDITIONAL_HANDLERS = { + "Linux Kernel Image", + "MSCAB", + "ISCAB", + "DOCKER_GZIP", + "GZIP", + "BZIP2", + "XZ", + "DOCKER_TAR", + "TAR", + "RAR", + "ZIP", + "JAR", + "WAR", + "EAR", + "APK", + "IPA", + "MSIX", + "ZLIB", + "CPIO_BIN big", + "CPIO_BIN little", + "ZSTANDARD", + "ZSTANDARD_DICTIONARY", + "ISO_9660_CD", + "MACOS_DMG", + "RPM Package" +} + +def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: + if not filetype: + return None + if filetype.startswith("EXTRACTCODE-"): + name = filetype[len("EXTRACTCODE-") :] + for handler in ec_archive.archive_handlers: + if handler.name == name: + return handler + logger.error(f"Unknown EXTRACTCODE handler: {name}") + handler = ec_archive.get_best_handler(filename) + return handler + + +# pylint: disable=too-many-positional-arguments +@surfactant.plugin.hookimpl +def extract_file_info( + sbom: SBOM, + software: Software, + filename: str, + filetype: str, + context_queue: "Queue[ContextEntry]", + current_context: Optional[ContextEntry], +) -> Optional[Dict[str, Any]]: + # Check if the file is compressed and get its format + handler = get_handler(filename, filetype) + + if handler: + create_extraction( + filename, + context_queue, + current_context, + lambda f, t: decompress_to(f, t, handler), + ) + + +def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler) -> bool: + extractors = handler.extractors + extractor = None + if len(extractors) == 1: + extractor = extractors[0] + elif len(extractors) == 2: + extractor = lambda f, t: ec_archive.extract_twice(f, t, extractors[0], extractors[1]) + else: + logger.error( + f"Unsupported number of extractors for {filename}: {len(extractors)}" + ) + return False + + logger.info(f"Extracting {filename} ({handler.name}) to {output_folder} using extractcode") + warnings = extractor(filename, output_folder) + if warnings: + for warning in warnings: + logger.warning(f"Warning while extracting {filename}: {warning}") + + return True diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 4695ebaa1..4d7bc1447 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -19,7 +19,6 @@ from queue import Queue from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union -import rarfile from loguru import logger import surfactant.plugin @@ -30,13 +29,8 @@ # Global list to track temp dirs GLOBAL_TEMP_DIRS_LIST = [] -RAR_SUPPORT = {"enabled": True} - - def supports_file(filetype: str) -> Optional[str]: - if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"} or ( - filetype == "RAR" and RAR_SUPPORT["enabled"] - ): + if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"}: return filetype return None @@ -143,8 +137,6 @@ def decompress_to(filename: str, output_folder: str, compression_format: str) -> ) # Since it doesn't seem to be a compressed tar file, try just decompressing the file return decompress_file(filename, output_folder, compression_format) - elif compression_format == "RAR": - decompress_rar_file(filename, output_folder) else: raise ValueError(f"Unsupported compression format: {compression_format}") return True @@ -222,38 +214,14 @@ def extract_tar_file( logger.info(f"Extracted TAR contents to {output_folder}") -def decompress_rar_file(filename: str, output_folder: str): - try: - rf = rarfile.RarFile(filename) - rf.extractall(path=output_folder) - except rarfile.Error as e: - logger.error(f"Error extracting rar file: {e}") - logger.info(f"Extracted RAR contents to {output_folder}") - - def delete_temp_dirs(): for temp_dir in GLOBAL_TEMP_DIRS_LIST: if temp_dir and os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - logger.info(f"Cleaned up temporary directory: {temp_dir}") - - -@surfactant.plugin.hookimpl -def init_hook(command_name: Optional[str] = None) -> None: - RAR_SUPPORT["enabled"] = False - - should_enable_rar = ConfigManager().get("rar", "enabled", True) - if should_enable_rar: - try: - result = rarfile.tool_setup() - if result.setup["open_cmd"][0] in ("UNRAR_TOOL", "UNAR_TOOL"): - RAR_SUPPORT["enabled"] = True - return - except rarfile.RarCannotExec: - pass - logger.warning( - "Install 'Unrar' or 'unar' tool for RAR archive decompression. RAR decompression disabled until installed." - ) + try: + shutil.rmtree(temp_dir) + logger.info(f"Cleaned up temporary directory: {temp_dir}") + except PermissionError as e: + logger.error(f"Permission error while deleting {temp_dir}: {e}") # Register exit handler diff --git a/surfactant/plugin/manager.py b/surfactant/plugin/manager.py index ff7fb1a92..54475e8e0 100644 --- a/surfactant/plugin/manager.py +++ b/surfactant/plugin/manager.py @@ -15,13 +15,14 @@ def _register_plugins(pm: pluggy.PluginManager) -> None: # pylint: disable=import-outside-toplevel # don't want all these imports as part of the file-level scope - from surfactant.filetypeid import id_extension, id_hex, id_magic + from surfactant.filetypeid import id_extension, id_hex, id_magic, id_extractcode from surfactant.infoextractors import ( a_out_file, coff_file, docker_image, elf_file, file_decompression, + extractcode_file, java_file, js_file, mach_o_file, @@ -48,6 +49,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None: id_magic, id_hex, id_extension, + id_extractcode, a_out_file, coff_file, docker_image, @@ -69,6 +71,7 @@ def _register_plugins(pm: pluggy.PluginManager) -> None: cytrics_reader, native_lib_file, file_decompression, + extractcode_file, ) for plugin in internal_plugins: pm.register(plugin) From f3f77b9c25294eb93a25c7b61a191f722a327111 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Jul 2025 22:31:18 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/filetypeid/id_extractcode.py | 15 +++++++++------ surfactant/infoextractors/extractcode_file.py | 15 +++++++-------- surfactant/infoextractors/file_decompression.py | 2 +- surfactant/plugin/manager.py | 4 ++-- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index 590cdc17a..7014bda8c 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: MIT from typing import Optional + from extractcode import archive as ec_archive from extractcode import sevenzip @@ -19,17 +20,19 @@ def identify_file_type(filepath: str) -> Optional[str]: except FileNotFoundError: return None + @surfactant.plugin.hookimpl def init_hook(command_name: Optional[str] = None) -> None: ec_archive.archive_handlers.append(WimHandler) + # Add WIM support to extractcode via 7zip WimHandler = ec_archive.Handler( - name='Microsoft wim', - filetypes=('Windows imaging (WIM) image'), - mimetypes=('application/x-ms-wim',), - extensions=('.wim',), + name="Microsoft wim", + filetypes=("Windows imaging (WIM) image"), + mimetypes=("application/x-ms-wim",), + extensions=(".wim",), kind=ec_archive.package, extractors=[sevenzip.extract], - strict=True -) \ No newline at end of file + strict=True, +) diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index 6d9ffcedc..9991e0d7c 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -8,13 +8,13 @@ # SPDX-License-Identifier: MIT from queue import Queue from typing import Any, Dict, Optional -from extractcode import archive as ec_archive +from extractcode import archive as ec_archive from loguru import logger -from surfactant.infoextractors.file_decompression import create_extraction import surfactant.plugin from surfactant import ContextEntry +from surfactant.infoextractors.file_decompression import create_extraction from surfactant.sbomtypes import SBOM, Software ADDITIONAL_HANDLERS = { @@ -42,9 +42,10 @@ "ZSTANDARD_DICTIONARY", "ISO_9660_CD", "MACOS_DMG", - "RPM Package" + "RPM Package", } + def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: if not filetype: return None @@ -88,15 +89,13 @@ def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler elif len(extractors) == 2: extractor = lambda f, t: ec_archive.extract_twice(f, t, extractors[0], extractors[1]) else: - logger.error( - f"Unsupported number of extractors for {filename}: {len(extractors)}" - ) + logger.error(f"Unsupported number of extractors for {filename}: {len(extractors)}") return False - + logger.info(f"Extracting {filename} ({handler.name}) to {output_folder} using extractcode") warnings = extractor(filename, output_folder) if warnings: for warning in warnings: logger.warning(f"Warning while extracting {filename}: {warning}") - + return True diff --git a/surfactant/infoextractors/file_decompression.py b/surfactant/infoextractors/file_decompression.py index 4d7bc1447..fd68427f5 100644 --- a/surfactant/infoextractors/file_decompression.py +++ b/surfactant/infoextractors/file_decompression.py @@ -23,12 +23,12 @@ import surfactant.plugin from surfactant import ContextEntry -from surfactant.configmanager import ConfigManager from surfactant.sbomtypes import SBOM, Software # Global list to track temp dirs GLOBAL_TEMP_DIRS_LIST = [] + def supports_file(filetype: str) -> Optional[str]: if filetype in {"TAR", "GZIP", "ZIP", "BZIP2", "XZ"}: return filetype diff --git a/surfactant/plugin/manager.py b/surfactant/plugin/manager.py index 54475e8e0..697167912 100644 --- a/surfactant/plugin/manager.py +++ b/surfactant/plugin/manager.py @@ -15,14 +15,14 @@ def _register_plugins(pm: pluggy.PluginManager) -> None: # pylint: disable=import-outside-toplevel # don't want all these imports as part of the file-level scope - from surfactant.filetypeid import id_extension, id_hex, id_magic, id_extractcode + from surfactant.filetypeid import id_extension, id_extractcode, id_hex, id_magic from surfactant.infoextractors import ( a_out_file, coff_file, docker_image, elf_file, - file_decompression, extractcode_file, + file_decompression, java_file, js_file, mach_o_file, From c4caca74bf4f249cf6000c2526e94d766c646bbf Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 15:34:43 -0700 Subject: [PATCH 03/12] Actually use file type whitelist --- surfactant/infoextractors/extractcode_file.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index 9991e0d7c..00b841698 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -49,14 +49,22 @@ def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: if not filetype: return None + + # Check if the filetype is an EXTRACTCODE handler if filetype.startswith("EXTRACTCODE-"): name = filetype[len("EXTRACTCODE-") :] for handler in ec_archive.archive_handlers: if handler.name == name: return handler logger.error(f"Unknown EXTRACTCODE handler: {name}") - handler = ec_archive.get_best_handler(filename) - return handler + + # Additionally handle some more file types that we can already identify from id_magic + if filetype in ADDITIONAL_HANDLERS: + handler = ec_archive.get_best_handler(filename) + if not handler: + logger.warning(f"No handler found for {filetype} ({filename}).") + return handler + return None # pylint: disable=too-many-positional-arguments From 7425ea748c8aba37ed8d763d1769c1b63f77df81 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 15:37:05 -0700 Subject: [PATCH 04/12] Fix lint --- surfactant/infoextractors/extractcode_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index 00b841698..55baa579d 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -95,7 +95,9 @@ def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler if len(extractors) == 1: extractor = extractors[0] elif len(extractors) == 2: - extractor = lambda f, t: ec_archive.extract_twice(f, t, extractors[0], extractors[1]) + def extract_twice(f: str, t: str) -> Any: + return ec_archive.extract_twice(f, t, extractors[0], extractors[1]) + extractor = extract_twice else: logger.error(f"Unsupported number of extractors for {filename}: {len(extractors)}") return False From d5427479db4bc0e93c49a43a3ca6380a28be8290 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 16:01:19 -0700 Subject: [PATCH 05/12] Gracefully handle extractcode failure --- surfactant/filetypeid/id_extractcode.py | 42 ++++++++++++------- surfactant/infoextractors/extractcode_file.py | 12 ++++-- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index 7014bda8c..ed0d5f410 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -4,14 +4,26 @@ # SPDX-License-Identifier: MIT from typing import Optional -from extractcode import archive as ec_archive -from extractcode import sevenzip +from loguru import logger import surfactant.plugin +try: + from extractcode import archive as ec_archive + from extractcode import sevenzip + EXTRACTCODE_AVAILABLE = True +except (ImportError, AttributeError) as e: + logger.warning(f"extractcode library not available in file type identification: {e}") + EXTRACTCODE_AVAILABLE = False + ec_archive = None + sevenzip = None + @surfactant.plugin.hookimpl def identify_file_type(filepath: str) -> Optional[str]: + if not EXTRACTCODE_AVAILABLE or ec_archive is None: + return None + try: ec_handler = ec_archive.get_best_handler(filepath) if ec_handler: @@ -23,16 +35,16 @@ def identify_file_type(filepath: str) -> Optional[str]: @surfactant.plugin.hookimpl def init_hook(command_name: Optional[str] = None) -> None: - ec_archive.archive_handlers.append(WimHandler) - - -# Add WIM support to extractcode via 7zip -WimHandler = ec_archive.Handler( - name="Microsoft wim", - filetypes=("Windows imaging (WIM) image"), - mimetypes=("application/x-ms-wim",), - extensions=(".wim",), - kind=ec_archive.package, - extractors=[sevenzip.extract], - strict=True, -) + if EXTRACTCODE_AVAILABLE: + WimHandler = ec_archive.Handler( + name="Microsoft wim", + filetypes=("Windows imaging (WIM) image"), + mimetypes=("application/x-ms-wim",), + extensions=(".wim",), + kind=ec_archive.package, + extractors=[sevenzip.extract], + strict=True, + ) + + ec_archive.archive_handlers.append(WimHandler) + diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index 55baa579d..fb6075e3f 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -9,13 +9,16 @@ from queue import Queue from typing import Any, Dict, Optional -from extractcode import archive as ec_archive from loguru import logger import surfactant.plugin from surfactant import ContextEntry from surfactant.infoextractors.file_decompression import create_extraction from surfactant.sbomtypes import SBOM, Software +from surfactant.filetypeid.id_extractcode import EXTRACTCODE_AVAILABLE + +if EXTRACTCODE_AVAILABLE: + from extractcode import archive as ec_archive ADDITIONAL_HANDLERS = { "Linux Kernel Image", @@ -46,7 +49,10 @@ } -def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: +def get_handler(filename, filetype: str) -> Optional["ec_archive.Handler"]: + if not EXTRACTCODE_AVAILABLE: + return None + if not filetype: return None @@ -89,7 +95,7 @@ def extract_file_info( ) -def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler) -> bool: +def decompress_to(filename: str, output_folder: str, handler: "ec_archive.Handler") -> bool: extractors = handler.extractors extractor = None if len(extractors) == 1: From 0aab44b9ad2244ab7526d8887b9fe97ca877137c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Jul 2025 23:01:55 +0000 Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/filetypeid/id_extractcode.py | 6 +++--- surfactant/infoextractors/extractcode_file.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index ed0d5f410..b1a1a1ef9 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -11,6 +11,7 @@ try: from extractcode import archive as ec_archive from extractcode import sevenzip + EXTRACTCODE_AVAILABLE = True except (ImportError, AttributeError) as e: logger.warning(f"extractcode library not available in file type identification: {e}") @@ -23,7 +24,7 @@ def identify_file_type(filepath: str) -> Optional[str]: if not EXTRACTCODE_AVAILABLE or ec_archive is None: return None - + try: ec_handler = ec_archive.get_best_handler(filepath) if ec_handler: @@ -45,6 +46,5 @@ def init_hook(command_name: Optional[str] = None) -> None: extractors=[sevenzip.extract], strict=True, ) - - ec_archive.archive_handlers.append(WimHandler) + ec_archive.archive_handlers.append(WimHandler) diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index fb6075e3f..efb176685 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -13,9 +13,9 @@ import surfactant.plugin from surfactant import ContextEntry +from surfactant.filetypeid.id_extractcode import EXTRACTCODE_AVAILABLE from surfactant.infoextractors.file_decompression import create_extraction from surfactant.sbomtypes import SBOM, Software -from surfactant.filetypeid.id_extractcode import EXTRACTCODE_AVAILABLE if EXTRACTCODE_AVAILABLE: from extractcode import archive as ec_archive @@ -52,10 +52,10 @@ def get_handler(filename, filetype: str) -> Optional["ec_archive.Handler"]: if not EXTRACTCODE_AVAILABLE: return None - + if not filetype: return None - + # Check if the filetype is an EXTRACTCODE handler if filetype.startswith("EXTRACTCODE-"): name = filetype[len("EXTRACTCODE-") :] @@ -63,7 +63,7 @@ def get_handler(filename, filetype: str) -> Optional["ec_archive.Handler"]: if handler.name == name: return handler logger.error(f"Unknown EXTRACTCODE handler: {name}") - + # Additionally handle some more file types that we can already identify from id_magic if filetype in ADDITIONAL_HANDLERS: handler = ec_archive.get_best_handler(filename) @@ -101,8 +101,10 @@ def decompress_to(filename: str, output_folder: str, handler: "ec_archive.Handle if len(extractors) == 1: extractor = extractors[0] elif len(extractors) == 2: + def extract_twice(f: str, t: str) -> Any: return ec_archive.extract_twice(f, t, extractors[0], extractors[1]) + extractor = extract_twice else: logger.error(f"Unsupported number of extractors for {filename}: {len(extractors)}") From 1730735c5ea6f6feae357be2f8aa7d633c258b18 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 16:10:35 -0700 Subject: [PATCH 07/12] Fix lints --- surfactant/filetypeid/id_extractcode.py | 3 ++- surfactant/infoextractors/extractcode_file.py | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index b1a1a1ef9..4260e8efb 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -8,12 +8,13 @@ import surfactant.plugin +from typecode.magic2 import NoMagicLibError try: from extractcode import archive as ec_archive from extractcode import sevenzip EXTRACTCODE_AVAILABLE = True -except (ImportError, AttributeError) as e: +except (ImportError, AttributeError, NoMagicLibError) as e: logger.warning(f"extractcode library not available in file type identification: {e}") EXTRACTCODE_AVAILABLE = False ec_archive = None diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index efb176685..f6cf84aa7 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -7,7 +7,7 @@ # # SPDX-License-Identifier: MIT from queue import Queue -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from loguru import logger @@ -17,8 +17,10 @@ from surfactant.infoextractors.file_decompression import create_extraction from surfactant.sbomtypes import SBOM, Software -if EXTRACTCODE_AVAILABLE: +if EXTRACTCODE_AVAILABLE or TYPE_CHECKING: from extractcode import archive as ec_archive +else: + ec_archive = None ADDITIONAL_HANDLERS = { "Linux Kernel Image", @@ -49,8 +51,8 @@ } -def get_handler(filename, filetype: str) -> Optional["ec_archive.Handler"]: - if not EXTRACTCODE_AVAILABLE: +def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: + if not EXTRACTCODE_AVAILABLE or ec_archive is None: return None if not filetype: @@ -95,7 +97,7 @@ def extract_file_info( ) -def decompress_to(filename: str, output_folder: str, handler: "ec_archive.Handler") -> bool: +def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler) -> bool: extractors = handler.extractors extractor = None if len(extractors) == 1: From f19b04f16d14f6332b7400c4f68a0edc9e413de7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Jul 2025 23:11:10 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/filetypeid/id_extractcode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index 4260e8efb..7402ed8c5 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -5,10 +5,10 @@ from typing import Optional from loguru import logger +from typecode.magic2 import NoMagicLibError import surfactant.plugin -from typecode.magic2 import NoMagicLibError try: from extractcode import archive as ec_archive from extractcode import sevenzip From 655e8a63fa79aa13dd46b52537b89cbcf25f1505 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 16:23:35 -0700 Subject: [PATCH 09/12] Fix lints --- surfactant/filetypeid/id_extractcode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index 7402ed8c5..f12ccc80d 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -14,7 +14,11 @@ from extractcode import sevenzip EXTRACTCODE_AVAILABLE = True -except (ImportError, AttributeError, NoMagicLibError) as e: +# pylint: disable-next=broad-exception-caught +except Exception as e: + # Catch NoMagicLibError and other library-specific errors during import + if type(e).__name__ != "NoMagicLibError" and not isinstance(e, ImportError) and not isinstance(e, AttributeError): + raise e logger.warning(f"extractcode library not available in file type identification: {e}") EXTRACTCODE_AVAILABLE = False ec_archive = None From 5f566b1e3c61415581d971184ca5de6b86de5d59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Jul 2025 23:24:02 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- surfactant/filetypeid/id_extractcode.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index f12ccc80d..fe995d463 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -5,7 +5,6 @@ from typing import Optional from loguru import logger -from typecode.magic2 import NoMagicLibError import surfactant.plugin @@ -17,7 +16,11 @@ # pylint: disable-next=broad-exception-caught except Exception as e: # Catch NoMagicLibError and other library-specific errors during import - if type(e).__name__ != "NoMagicLibError" and not isinstance(e, ImportError) and not isinstance(e, AttributeError): + if ( + type(e).__name__ != "NoMagicLibError" + and not isinstance(e, ImportError) + and not isinstance(e, AttributeError) + ): raise e logger.warning(f"extractcode library not available in file type identification: {e}") EXTRACTCODE_AVAILABLE = False From 76cfd25b57b75bc278b613e3f466cf174f97bf01 Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Fri, 11 Jul 2025 16:28:11 -0700 Subject: [PATCH 11/12] Fix lints --- surfactant/filetypeid/id_extractcode.py | 3 ++- surfactant/infoextractors/extractcode_file.py | 9 +++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index fe995d463..76a87bf08 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -1,7 +1,8 @@ -# Copyright 2023 Lawrence Livermore National Security, LLC +# Copyright 2025 Lawrence Livermore National Security, LLC # See the top-level LICENSE file for details. # # SPDX-License-Identifier: MIT + from typing import Optional from loguru import logger diff --git a/surfactant/infoextractors/extractcode_file.py b/surfactant/infoextractors/extractcode_file.py index f6cf84aa7..0f5b90258 100644 --- a/surfactant/infoextractors/extractcode_file.py +++ b/surfactant/infoextractors/extractcode_file.py @@ -2,10 +2,7 @@ # See the top-level LICENSE file for details. # # SPDX-License-Identifier: MIT -# Copyright 2025 Lawrence Livermore National Security, LLC -# See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: MIT + from queue import Queue from typing import TYPE_CHECKING, Any, Dict, Optional @@ -51,7 +48,7 @@ } -def get_handler(filename, filetype: str) -> Optional[ec_archive.Handler]: +def get_handler(filename, filetype: str) -> Optional["ec_archive.Handler"]: if not EXTRACTCODE_AVAILABLE or ec_archive is None: return None @@ -97,7 +94,7 @@ def extract_file_info( ) -def decompress_to(filename: str, output_folder: str, handler: ec_archive.Handler) -> bool: +def decompress_to(filename: str, output_folder: str, handler: "ec_archive.Handler") -> bool: extractors = handler.extractors extractor = None if len(extractors) == 1: From 3e5d9d87b5e07f29965a1bc837670d91d20002bb Mon Sep 17 00:00:00 2001 From: Asriel Margarian Date: Mon, 14 Jul 2025 14:30:33 -0700 Subject: [PATCH 12/12] Fix build error --- pyproject.toml | 3 +-- surfactant/filetypeid/id_extractcode.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dcdec57e4..aeecb8e1d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ dependencies = [ # Pinned to specific version for potential breaking changes "networkx>=2.6", "python-msi==0.0.0a2", - "extractcode>=31.0.0", ] dynamic = ["version"] @@ -67,7 +66,7 @@ dynamic = ["version"] [project.optional-dependencies] macho = ["lief==0.16.6"] java = ["javatools>=1.6,==1.*"] -extract-full = ["extractcode[full]"] +extractcode = ["extractcode[full]>=31.0.0"] test = ["pytest"] dev = ["build", "pre-commit"] docs = ["sphinx", "myst-parser"] diff --git a/surfactant/filetypeid/id_extractcode.py b/surfactant/filetypeid/id_extractcode.py index 76a87bf08..2d32cca99 100644 --- a/surfactant/filetypeid/id_extractcode.py +++ b/surfactant/filetypeid/id_extractcode.py @@ -16,13 +16,7 @@ EXTRACTCODE_AVAILABLE = True # pylint: disable-next=broad-exception-caught except Exception as e: - # Catch NoMagicLibError and other library-specific errors during import - if ( - type(e).__name__ != "NoMagicLibError" - and not isinstance(e, ImportError) - and not isinstance(e, AttributeError) - ): - raise e + # Catch any import errors related to extractcode logger.warning(f"extractcode library not available in file type identification: {e}") EXTRACTCODE_AVAILABLE = False ec_archive = None