diff --git a/.gitignore b/.gitignore index f9457e9d0..0c5d97563 100644 --- a/.gitignore +++ b/.gitignore @@ -279,4 +279,4 @@ $RECYCLE.BIN/ .Trash-* # .nfs files are created when an open file is removed but is still being accessed -.nfs* +.nfs* \ No newline at end of file diff --git a/surfactant/cmd/generate.py b/surfactant/cmd/generate.py index 4698528a4..b8271e4e4 100644 --- a/surfactant/cmd/generate.py +++ b/surfactant/cmd/generate.py @@ -7,10 +7,11 @@ import queue import re import sys -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import click from loguru import logger +from networkx.exception import NetworkXError from surfactant import ContextEntry from surfactant.cmd.internal.generate_utils import SpecimenContextParamType @@ -324,7 +325,7 @@ def sbom( while not contextQ.empty(): entry: ContextEntry = contextQ.get() if entry.archive: - logger.info("Processing parent container " + str(entry.archive)) + logger.info(f"Processing parent container {entry.archive}") # TODO: if the parent archive has an info extractor that does unpacking interally, should the children be added to the SBOM? 
# current thoughts are (Syft) doesn't provide hash information for a proper SBOM software entry, so exclude these # extractor plugins meant to unpack files could be okay when used on an "archive", but then extractPaths should be empty @@ -419,17 +420,40 @@ def sbom( user_institution_name=recorded_institution, container_prefix=entry.containerPrefix, ) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught raise RuntimeError(f"Unable to process: {filepath}") from e entries.append(sw_parent) entries.extend(sw_children if sw_children else []) + # ------------------------------------------------------------------------ + # (Optional - Early Injection) Inject symlink paths into each Software entry so SBOM helper handles them + # ------------------------------------------------------------------------ + # Early injection: add symlinks gathered so far so fs_tree sees them + for sw in entries: + if sw.fileName is None: + sw.fileName = [] + if sw.installPath is None: + sw.installPath = [] + # Filename symlinks + for link in filename_symlinks.get(sw.sha256, []): + if link not in sw.fileName: + logger.debug( + f"Injecting filename symlink '{link}' for SHA {sw.sha256}" + ) + sw.fileName.append(link) + # Install-path symlinks + for link in file_symlinks.get(sw.sha256, []): + if link not in sw.installPath: + logger.debug( + f"Injecting install-path symlink '{link}' for SHA {sw.sha256}" + ) + sw.installPath.append(link) new_sbom.add_software_entries(entries, parent_entry=parent_entry) # epath was a file, no need to walk the directory tree continue # epath is a directory, walk it for cdir, dirs, files in os.walk(epath): - logger.info("Processing " + str(cdir)) + logger.info(f"Processing {cdir}") if entry.installPrefix: for dir_ in dirs: @@ -446,8 +470,20 @@ def sbom( epath.as_posix(), entry.installPrefix, dest ) dir_symlinks.append((install_source, install_dest)) - - entries = [] + # Reflect in fs_tree immediately + try: + 
new_sbom.record_symlink( + install_source, install_dest, subtype="directory" + ) + logger.debug( + f"[fs_tree] (dir) {install_source} → {install_dest}" + ) + except (NetworkXError, ValueError) as e: + logger.warning( + f"Failed to record directory symlink in fs_tree: {install_source} → {install_dest}: {e}" + ) + + entries: List[Software] = [] for file in files: # os.path.join will insert an OS specific separator between cdir and f # need to make sure that separator is a / and not a \ on windows @@ -463,6 +499,7 @@ def sbom( # Dead/infinite links will error so skip them if true_filepath is None: continue + # Compute sha256 hash of the file; skip if the file pointed by the symlink can't be opened try: true_file_sha256 = sha256sum(true_filepath) @@ -471,6 +508,26 @@ def sbom( f"Unable to open symlink {filepath} pointing to {true_filepath}" ) continue + + # Record both source and target paths under the same hash node + install_filepath = real_path_to_install_path( + epath.as_posix(), entry.installPrefix, filepath + ) + install_dest = real_path_to_install_path( + epath.as_posix(), entry.installPrefix, true_filepath + ) + + try: + new_sbom.record_hash_node(install_filepath, true_file_sha256) + new_sbom.record_hash_node(install_dest, true_file_sha256) + logger.debug( + f"[fs_tree] Linked symlink + target by hash: {install_filepath} ↔ {install_dest}" + ) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning( + f"[fs_tree] Failed to link symlink + target by hash for {filepath}: {e}" + ) + # Record the symlink name to be added as a file name # Dead links would appear as a file, so need to check the true path to see # if the thing pointed to is a file or a directory @@ -496,6 +553,22 @@ def sbom( file_symlinks[true_file_sha256].append(install_filepath) else: dir_symlinks.append((install_filepath, install_dest)) + + # Reflect this symlink in fs_tree immediately + try: + subtype = ( + "file" if os.path.isfile(true_filepath) else "directory" + ) + 
new_sbom.record_symlink( + install_filepath, install_dest, subtype=subtype + ) + logger.debug( + f"[fs_tree] ({subtype}) {install_filepath} → {install_dest}" + ) + except (NetworkXError, ValueError) as e: + logger.warning( + f"Failed to record symlink in fs_tree: {install_filepath} → {install_dest}: {e}" + ) # NOTE Two cases that don't get recorded (but maybe should?) are: # 1. If the file pointed to is outside the extract paths, it won't # appear in the SBOM at all -- is that desirable? If it were included, @@ -509,6 +582,8 @@ def sbom( entry.includeFileExts = [] if not entry.excludeFileExts: entry.excludeFileExts = [] + + # file-type identification and SBOM entry creation if ( ( ftype := pm.hook.identify_file_type( @@ -539,69 +614,52 @@ def sbom( or entry.omitUnrecognizedTypes, container_prefix=entry.containerPrefix, ) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught raise RuntimeError(f"Unable to process: {filepath}") from e entries.append(sw_parent) entries.extend(sw_children if sw_children else []) + # ------------------------------------------------------------------------ + # (Optional - Early Injection) Inject symlink paths into each Software entry so SBOM helper handles them + # ------------------------------------------------------------------------ + # Early injection for batch (so fs_tree captures aliases) + for sw in entries: + if sw.fileName is None: + sw.fileName = [] + if sw.installPath is None: + sw.installPath = [] + # Filename symlinks + for link in filename_symlinks.get(sw.sha256, []): + if link not in sw.fileName: + logger.debug( + f"Injecting filename symlink '{link}' for SHA {sw.sha256}" + ) + sw.fileName.append(link) + # Install-path symlinks + for link in file_symlinks.get(sw.sha256, []): + if link not in sw.installPath: + logger.debug( + f"Injecting install-path symlink '{link}' for SHA {sw.sha256}" + ) + sw.installPath.append(link) new_sbom.add_software_entries(entries, parent_entry=parent_entry) 
- # Add symlinks to install paths and file names - for software in new_sbom.software: - # ensure fileName, installPath, and metadata lists for the software entry have been created - # for a user supplied input SBOM, there are no guarantees - if software.fileName is None: - software.fileName = [] - if software.installPath is None: - software.installPath = [] - if software.metadata is None: - software.metadata = [] - if software.sha256 in filename_symlinks: - filename_symlinks_added = [] - for filename in filename_symlinks[software.sha256]: - if filename not in software.fileName: - software.fileName.append(filename) - filename_symlinks_added.append(filename) - if filename_symlinks_added: - # Store information on which file names are symlinks - software.metadata.append({"fileNameSymlinks": filename_symlinks_added}) - if software.sha256 in file_symlinks: - symlinks_added = [] - for full_path in file_symlinks[software.sha256]: - if full_path not in software.installPath: - software.installPath.append(full_path) - symlinks_added.append(full_path) - if symlinks_added: - # Store information on which install paths are symlinks - software.metadata.append({"installPathSymlinks": symlinks_added}) - - # Add directory symlink destinations to extract/install paths - for software in new_sbom.software: - # NOTE: this probably doesn't actually add any containerPath symlinks - for paths in (software.containerPath, software.installPath): - if paths is None: - continue - paths_to_add = [] - for path in paths: - for link_source, link_dest in dir_symlinks: - if path.startswith(link_dest): - # Replace the matching start with the symlink instead - # We can't use os.path.join here because we end up with absolute paths after - # removing the common start. 
- paths_to_add.append(path.replace(link_dest, link_source, 1)) - if paths_to_add: - found_md_installpathsymlinks = False - # make sure software.metadata list has been initialized - if software.metadata is None: - software.metadata = [] - if isinstance(software.metadata, Iterable): - for md in software.metadata: - if isinstance(md, Dict) and "installPathSymlinks" in md: - found_md_installpathsymlinks = True - md["installPathSymlinks"] += paths_to_add - if not found_md_installpathsymlinks: - software.metadata.append({"installPathSymlinks": paths_to_add}) - paths += paths_to_add + # ------------------------------------------------------------------ + # Expand deferred directory symlinks once fs_tree is fully populated + # ------------------------------------------------------------------ + new_sbom.expand_pending_dir_symlinks() + + # ------------------------------------------------------------------ + # Expand deferred file symlinks after all installPath nodes are added + # ------------------------------------------------------------------ + new_sbom.expand_pending_file_symlinks() + + # ------------------------------------------------------------------ + # Inject legacy-style symlink metadata (fileNameSymlinks and + # installPathSymlinks) derived from fs_tree relationships + # ------------------------------------------------------------------ + new_sbom.inject_symlink_metadata() + else: logger.info("Skipping gathering file metadata and adding software entries") diff --git a/surfactant/cmd/merge.py b/surfactant/cmd/merge.py index c8214ec76..3888bc2c7 100644 --- a/surfactant/cmd/merge.py +++ b/surfactant/cmd/merge.py @@ -98,7 +98,11 @@ def merge( merged_sbom.merge(sbom_m) # Find root nodes: those with zero incoming edges - roots = [n for n, deg in merged_sbom.graph.in_degree() if deg == 0] + roots = [ + n + for n, deg in merged_sbom.graph.in_degree() + if deg == 0 and merged_sbom.graph.nodes.get(n, {}).get("type") != "Path" + ] logger.info(f"ROOT NODES: {roots}") # 
Detect any directed cycles @@ -122,6 +126,8 @@ def merge( if config and "systemRelationship" in config: system_relationship = config["systemRelationship"] for root_uuid in roots: + if merged_sbom.graph.nodes.get(root_uuid, {}).get("type") == "Path": + continue merged_sbom.create_relationship(system_obj.UUID, root_uuid, system_relationship) else: logger.warning( diff --git a/surfactant/output/cytrics_writer.py b/surfactant/output/cytrics_writer.py index 8bd5a276d..8614c2fcc 100755 --- a/surfactant/output/cytrics_writer.py +++ b/surfactant/output/cytrics_writer.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: MIT from typing import Optional +from loguru import logger + import surfactant.plugin from surfactant.sbomtypes import SBOM @@ -11,6 +13,7 @@ @surfactant.plugin.hookimpl def write_sbom(sbom: SBOM, outfile) -> None: # outfile is a file pointer, not a file name + logger.debug("writing SBOM") outfile.write(sbom.to_json(indent=2)) diff --git a/surfactant/relationships/_internal/windows_utils.py b/surfactant/relationships/_internal/windows_utils.py index cf1245c80..863b93233 100644 --- a/surfactant/relationships/_internal/windows_utils.py +++ b/surfactant/relationships/_internal/windows_utils.py @@ -37,3 +37,47 @@ def find_installed_software( # matching probe directory and filename, add software to list possible_matches.append(e) return possible_matches + + +# construct a list of directories to probe for establishing dotnet relationships +def get_dotnet_probedirs(software: Software, refCulture, refName, dnProbingPaths): + probedirs = [] + # probe for the referenced assemblies + if isinstance(software.installPath, Iterable): + for install_filepath in software.installPath: + install_basepath = pathlib.PureWindowsPath(install_filepath).parent.as_posix() + if refCulture is None or refCulture == "": + # [application base] / [assembly name].dll + # [application base] / [assembly name] / [assembly name].dll + 
probedirs.append(pathlib.PureWindowsPath(install_basepath).as_posix()) + probedirs.append(pathlib.PureWindowsPath(install_basepath, refName).as_posix()) + if dnProbingPaths is not None: + # add probing private paths + for path in dnProbingPaths: + # [application base] / [binpath] / [assembly name].dll + # [application base] / [binpath] / [assembly name] / [assembly name].dll + probedirs.append(pathlib.PureWindowsPath(install_basepath, path).as_posix()) + probedirs.append( + pathlib.PureWindowsPath(install_basepath, path, refName).as_posix() + ) + else: + # [application base] / [culture] / [assembly name].dll + # [application base] / [culture] / [assembly name] / [assembly name].dll + probedirs.append(pathlib.PureWindowsPath(install_basepath, refCulture).as_posix()) + probedirs.append( + pathlib.PureWindowsPath(install_basepath, refName, refCulture).as_posix() + ) + if dnProbingPaths is not None: + # add probing private paths + for path in dnProbingPaths: + # [application base] / [binpath] / [culture] / [assembly name].dll + # [application base] / [binpath] / [culture] / [assembly name] / [assembly name].dll + probedirs.append( + pathlib.PureWindowsPath(install_basepath, path, refCulture).as_posix() + ) + probedirs.append( + pathlib.PureWindowsPath( + install_basepath, path, refName, refCulture + ).as_posix() + ) + return probedirs diff --git a/surfactant/relationships/dotnet_relationship.py b/surfactant/relationships/dotnet_relationship.py index 40827320a..d859c411f 100644 --- a/surfactant/relationships/dotnet_relationship.py +++ b/surfactant/relationships/dotnet_relationship.py @@ -1,31 +1,300 @@ -# Copyright 2023 Lawrence Livermore National Security, LLC -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT import pathlib from collections.abc import Iterable from typing import List, Optional +from loguru import logger + import surfactant.plugin +from surfactant.relationships._internal.windows_utils import ( + find_installed_software, + get_dotnet_probedirs, +) from surfactant.sbomtypes import SBOM, Relationship, Software +from surfactant.utils.paths import normalize_path -from ._internal.windows_utils import find_installed_software +# ------------------------------------------------------------------------- +# Legacy Documentation +# ------------------------------------------------------------------------- +# Unmanaged (.dll/.so/.dylib) resolution background +# +# Reference: +# https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/loading-unmanaged +# +# The .NET runtime resolves unmanaged/native libraries through a multistage +# search process: +# +# 1. Check the active AssemblyLoadContext cache. +# +# 2. Invoke any resolver registered via SetDllImportResolver(). +# - Example using SetDllImportResolver: +# https://learn.microsoft.com/en-us/dotnet/standard/native-interop/native-library-loading +# - Behavior: +# * Checks the PInvoke or Assembly-level DefaultDllImportSearchPathsAttribute, +# then the assembly's directory, then calls LoadLibraryEx with the +# LOAD_WITH_ALTERED_SEARCH_PATH flag (on Windows). +# - DefaultDllImportSearchPathsAttribute notes: +# * Has no effect on non-Windows platforms / Mono runtime. +# * API reference: +# https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.defaultdllimportsearchpathsattribute?view=net-7.0 +# * Its "Paths" property is a bitwise combination of values from: +# https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.dllimportsearchpath?view=net-7.0 +# +# 3. The active AssemblyLoadContext calls its LoadUnmanagedDll function +# (default behavior is effectively the same as assembly reference probing). +# - a. 
This method can be overridden to provide custom unmanaged library +# resolution logic. +# * The default implementation returns IntPtr.Zero, which tells the +# runtime to continue using its normal unmanaged library resolution +# policy. +# - b. API reference: +# https://learn.microsoft.com/en-us/dotnet/api/system.runtime.loader.assemblyloadcontext.loadunmanageddll?view=net-7.0 +# +# 4. Run default unmanaged library probing logic by parsing *.deps.json +# probing properties. +# - a. If the json file isn't present, assume the calling assembly's +# directory contains the library. +# - b. Reference: +# https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/default-probing#unmanaged-native-library-probing +# +# 5. Platform-specific probing notes: +# - On Linux, if the libname ends with ".so" or contains ".so.", the +# runtime attempts version variations such as: +# libfoo.so +# libfoo.so.1 +# libfoo.so.1.2.3 +# (Legacy code references Issue #79 indicating regex-based matching for +# version variations was needed but not yet implemented.) +# +# 6. Legacy Surfactant unmanaged resolution logic: +# - Construct a list of candidate filenames following the rules outlined +# earlier in SetDllImportResolver behavior. +# - Candidate list includes: +# refName +# refName.dll +# refName.exe +# refName.so +# refName.dylib +# lib.so +# lib.dylib +# lib +# - This list corresponds to the combinations described in (2.a). +# (Legacy comments reference Issue #80, noting the need to verify that +# these candidate combinations behave correctly across platforms.) +# - Versioned ".so" variations were NOT evaluated. +# (Related to Issue #79 — regex matching needed for versioned .so names.) +# - Determine probing directories by taking the parent directory of each +# entry in software.installPath. +# - Search for all candidate filenames using find_installed_software(). +# +# 7. 
Absolute-path unmanaged imports: +# - If the DllImport / PInvoke name is an absolute Windows path: +# * Convert to PureWindowsPath. +# * Compare this absolute path against every installPath entry of every +# software node in the SBOM. +# * A relationship is created only if an exact match is found. +# - When an absolute path is used, no probing or filename variants are +# attempted. +# ------------------------------------------------------------------------- + +# ------------------------------------------------------------------------- +# Managed (.dll/.exe) assembly resolution background +# +# Reference: +# https://learn.microsoft.com/en-us/dotnet/framework/deployment/how-the-runtime-locates-assemblies +# +# The .NET runtime locates managed assemblies using the following steps: +# +# 1. Determine the correct assembly version using configuration files +# (binding redirects, code location, culture, version, etc.). +# +# 2. Check whether the assembly name has already been bound; if so, the runtime +# uses the previously loaded assembly instead of probing. +# +# 3. Check the Global Assembly Cache (GAC) +# - %WINDIR%\Microsoft.NET\assembly (used by .NET Framework 4 and later) +# - %WINDIR%\assembly (used by earlier .NET Framework versions) +# +# 4. Probe for assembly: +# +# a. Check for a codeBase element in the app configuration file. +# - If codeBase is present, the runtime checks the specified +# location for the assembly. +# - If the assembly is found, probing stops entirely. +# - If the assembly is not found, the runtime fails without performing +# any further probing. +# - The href may be: +# * http:// or https:// +# * file:// +# * a relative path (interpreted as relative to the application's +# base directory, i.e., the parent of installPath). +# +# b. If there is no codeBase element, begin standard probing: +# - Search application base + culture + assembly name directories. +# - Search privatePath directories from a probing element, combined +# with culture / application base / assembly name. 
+# (privatePath directories are evaluated before standard probing +# locations.) +# - The location of the calling assembly may be used as a hint for where +# to find the referenced assembly. +# +# 5. Standard probing when no codeBase element is present: +# +# Application-base probing: +# [appBase] / [assembly name].dll +# [appBase] / [assembly name] / [assembly name].dll +# +# Culture-specific probing: +# [appBase] / [culture] / [assembly name].dll +# [appBase] / [culture] / [assembly name] / [assembly name].dll +# +# Probing via the probing element's privatePath attribute: +# - privatePath values are split on ";" (Windows convention). +# - For each privatePath: +# * [appBase] / [privatePath] / [assembly name].dll +# * [appBase] / [privatePath] / [assembly name] / [assembly name].dll +# +# Calling-assembly influence: +# - The location of the calling assembly may be used as a hint for +# where to find the referenced assembly. +# - The legacy implementation approximated this by probing the +# parent directory of each installPath entry. +# +# 6. Missing assemblies: +# - The legacy implementation intentionally did not log assemblies that +# were not found, because such messages would overwhelmingly consist of +# unresolved system/core .NET assemblies and produce excessive noise. 
+# ------------------------------------------------------------------------- + +# ------------------------------------------------------------------------- +# Probing directory construction rules (legacy logic) +# +# For each software.installPath entry: +# install_basepath = dirname(installPath) +# +# For each referenced assembly: +# +# If Culture is None or empty ("neutral"): +# Add directories corresponding to: +# [application base] / [assembly name].dll +# [application base] / [assembly name] / [assembly name].dll +# +# If dnProbingPaths (from the probing element's privatePath) exists: +# For each binPath in dnProbingPaths: +# [application base] / [binpath] / [assembly name].dll +# [application base] / [binpath] / [assembly name] / [assembly name].dll +# +# If Culture is specified: +# Add directories corresponding to: +# [application base] / [culture] / [assembly name].dll +# [application base] / [culture] / [assembly name] / [assembly name].dll +# +# If dnProbingPaths exists: +# For each binPath in dnProbingPaths: +# [application base] / [binpath] / [culture] / [assembly name].dll +# [application base] / [binpath] / [culture] / [assembly name] / [assembly name].dll +# +# Notes: +# * dnProbingPaths is derived from: +# appConfigFile.runtime.assemblyBinding.probing.privatePath +# and is split on ";" (Windows convention). +# * These directory patterns mirror the .NET Framework probing rules +# documented in: +# https://learn.microsoft.com/en-us/dotnet/framework/deployment/how-the-runtime-locates-assemblies +# ------------------------------------------------------------------------- + +# ------------------------------------------------------------------------- +# Absolute-path unmanaged imports behavior (legacy) +# +# def is_absolute_path(fname: str) -> bool: +# return PureWindowsPath(fname).is_absolute() +# +# Example: +# +# +# Legacy logic: +# - If the import Name is an absolute path: +# * Convert to PureWindowsPath. 
+# * For each software entry in the SBOM: +# For each installPath of that entry: +# If the absolute path exactly matches installPath: +# → Create a Relationship(dependent_uuid, match.UUID, "Uses") +# +# - If absolute, no probing or variant-name construction occurs. +# +# - If not absolute: +# → Apply unmanaged probing behavior: +# * Candidate filename list (dll/so/dylib/lib variants) +# * Search installer directories via find_installed_software() +# ------------------------------------------------------------------------- def has_required_fields(metadata) -> bool: - # dotnetAssemblyRef must present, otherwise we have no info on .NET imports + """ + Check whether the metadata includes .NET assembly references. + """ return "dotnetAssemblyRef" in metadata +def is_absolute_path(fname: str) -> bool: + givenpath = pathlib.PureWindowsPath(fname) + return givenpath.is_absolute() + + @surfactant.plugin.hookimpl def establish_relationships( sbom: SBOM, software: Software, metadata ) -> Optional[List[Relationship]]: + """ + SurfActant plugin: Establish 'Uses' relationships for .NET assembly dependencies. + + Implements a 3-phase resolution strategy for managed (.dll/.exe) assemblies: + + 1. Primary (fs_tree exact-path resolution): + - Construct concrete probe paths using legacy .NET probing rules + (get_dotnet_probedirs). + - Resolve each candidate path via sbom.get_software_by_path(), leveraging the fs_tree + symlink edges. + - (COMMENTED OUT) Apply identity filters (version and culture) when metadata is present. + - This phase provides the most precise resolution. + + 2. Secondary (legacy full scan fallback): + - Executed only if Phase 1 finds no matches. + - Reproduces legacy behavior exactly: + * Probe the same legacy probing directories. + * Match strictly on refName + ".dll". + * Use find_installed_software() without version or culture filtering. + - This phase intentionally prioritizes compatibility over precision. + + 3. 
Finalization: + - Deduplicate matches and emit 'Uses' relationships. + - Record which resolution method produced each match for debugging. + + Also supports: + - Resolving unmanaged/native libraries via dotnetImplMap: + * Absolute-path fast path (fs_tree-aware) + * Legacy filename variants (.dll/.so/.dylib/lib*) + * Directory-based probing + - Honoring .NET app.config rules + - Honoring relative paths + - Avoiding self-dependencies + + Args: + sbom (SBOM): The current SBOM graph. + software (Software): The importing software. + metadata (dict): Parsed metadata for .NET imports. + + Returns: + Optional[List[Relationship]]: A list of 'Uses' relationships, or None if + no applicable .NET metadata is present. + """ + if not has_required_fields(metadata): + logger.debug(f"[.NET] Skipping: No usable .NET metadata for {software.UUID}") return None relationships: List[Relationship] = [] dependent_uuid = software.UUID + + # The following variables declared in legacy but never used and are kept for potential future use dnName = None dnCulture = None dnVersion = None @@ -38,16 +307,17 @@ def establish_relationships( if "Version" in dnAssembly: dnVersion = dnAssembly["Version"] + # --- Extract appConfig metadata --- # get additional probing paths if they exist dnProbingPaths = None dnDependentAssemblies = None windowsAppConfig = None - windowsManifest = None - if "appConfigFile" in metadata: - windowsAppConfig = metadata["appConfigFile"] + windowsManifest = None # This variable was declared in legacy but never used and is kept for potential future use if "manifestFile" in metadata: windowsManifest = metadata["manifestFile"] + if "appConfigFile" in metadata: + windowsAppConfig = metadata["appConfigFile"] if windowsAppConfig: if "runtime" in windowsAppConfig: @@ -65,45 +335,70 @@ def establish_relationships( dnProbingPaths = [] dnProbingPaths.append(pathlib.PureWindowsPath(path).as_posix()) - # https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/loading-unmanaged - 
# 1. Check the active AssemblyLoadContext cache - # 2. Calling the import resolver set by the setDllImportResolver function - # - a. Example using SetDllImportResolver: https://learn.microsoft.com/en-us/dotnet/standard/native-interop/native-library-loading - # - b. Checks PInvoke's or Assembly's DefaultDllImportSearchPathsAttribute, then the assembly's directory, then LoadLibraryEx with LOAD_WITH_ALTERED_SEARCH_PATH flag - # - This attribute has no effect on non-Windows platforms / Mono runtime - # - https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.defaultdllimportsearchpathsattribute?view=net-7.0 - # - i. This has a "Paths" property which is a bitwise combination of paths specified in ii: - # - ii. https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.dllimportsearchpath?view=net-7.0 - # 3. The active AssemblyLoadContext calls its LoadUnmanagedDll function (Default behavior is the same as AssemblyRef probing?) - # - a. Can be overridden, but the default implementation returns IntPtr.Zero, which tells the runtime to load with its default policy. - # - b. https://learn.microsoft.com/en-us/dotnet/api/system.runtime.loader.assemblyloadcontext.loadunmanageddll?view=net-7.0 - # 4. Run default unmanaged library probing logic by parsing *.deps.json probing properties - # - a. If the json file isn't present, assume the calling assembly's directory contains the library - # - b. 
https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/default-probing#unmanaged-native-library-probing + # --- Handle unmanaged libraries from dotnetImplMap --- if "dotnetImplMap" in metadata: for asmRef in metadata["dotnetImplMap"]: if "Name" not in asmRef: continue refName = asmRef["Name"] - # Check absolute path against entries in software + # Absolute path fast path (restores legacy behavior, but fs_tree-aware) + # + # Legacy did: + # - If Name is an absolute Windows path, compare that absolute path + # directly against all Software.installPath entries. + # - If it matches, emit Uses and skip probing entirely. + # + # New behavior: + # - Normalize the absolute path to POSIX style. + # - Use sbom.get_software_by_path(norm) so we benefit from fs_tree + # - Skip self (dependent_uuid) to avoid self-loops. + if is_absolute_path(refName): + norm = normalize_path(refName) + # 1) Graph-first: fs_tree + symlink edges + match = sbom.get_software_by_path(norm, case_insensitive=True) + if match and match.UUID != dependent_uuid: + logger.debug( + f"[.NET][unmanaged][abs] {refName} (norm={norm}) → UUID={match.UUID}" + ) + relationships.append(Relationship(dependent_uuid, match.UUID, "Uses")) + continue + + # 2) Legacy fallback: strict PureWindowsPath equality across all installPath entries ref_abspath = pathlib.PureWindowsPath(refName) + legacy_found = False for e in sbom.software: - if e.installPath is None: + if e.installPath is None or e.UUID == dependent_uuid: continue - if isinstance(e.installPath, Iterable): + if isinstance(e.installPath, Iterable) and not isinstance( + e.installPath, (str, bytes) + ): for ifile in e.installPath: if ref_abspath == pathlib.PureWindowsPath(ifile): + logger.debug( + f"[.NET][unmanaged][abs] {refName} → UUID={e.UUID} [legacy_fallback]" + ) relationships.append(Relationship(dependent_uuid, e.UUID, "Uses")) + legacy_found = True + + if not legacy_found: + logger.debug(f"[.NET][unmanaged][abs] {refName} (norm={norm}) → no match") + 
+ # Legacy behavior: absolute path means no probing/variants continue + # Probe directories from this software's installPath probedirs = [] if isinstance(software.installPath, Iterable): for ipath in software.installPath: probedirs.append(pathlib.PureWindowsPath(ipath).parent.as_posix()) - # Construct a list of combinations specified in (2.a) + logger.debug(f"[.NET][unmanaged] probedirs for {refName}: {probedirs}") + + # Build candidate filenames for unmanaged imports (legacy behavior) + # Construct a list of combinations specified in (2) # Refer to Issue #80 - Need to verify that this conforms with cross-platform behavior + logger.debug(f"[.NET][unmanaged] resolving import: {refName}") combinations = [refName] if not (refName.endswith(".dll") or refName.endswith(".exe")): combinations.append(f"{refName}.dll") @@ -116,26 +411,28 @@ def establish_relationships( f"lib{refName}", ] ) + logger.debug(f"[.NET][unmanaged] candidates for {refName}: {combinations}") + + found = False # On Linux, if the libname ends with .so or has .so. then version variations are tried # Refer to Issue #79 - Need regex matching for version variations for e in find_installed_software(sbom, probedirs, combinations): - dependency_uuid = e.UUID - relationships.append(Relationship(dependent_uuid, dependency_uuid, "Uses")) - - # https://learn.microsoft.com/en-us/dotnet/framework/deployment/how-the-runtime-locates-assemblies - # 1. Determine correct assembly version using configuration files (binding redirects, code location, etc) - # 2. Check if assembly name bound before; if it is use previously loaded assembly - # 3. Check global assembly cache (%windir%\Microsoft.NET\assembly in .NET framework 4, %windir%\assembly previously) - # 4. Probe for assembly: - # - a. Check for element in app config; check the given location and if assembly found great no probing; otherwise fail without probing - # - b. 
If there is no element, begin probing using - # - application base + culture + assembly name directories - # - privatePath directories from a probing element, combined with culture/appbase/assemblyname (done before the standard probing directories) - # - the location of the calling assembly may be used as a hint for where to find the referenced assembly + if e and e.UUID != dependent_uuid: + dependency_uuid = e.UUID + logger.debug(f"[.NET][unmanaged] {refName} → UUID={dependency_uuid}") + relationships.append(Relationship(dependent_uuid, dependency_uuid, "Uses")) + found = True + + if not found: + logger.debug(f"[.NET][unmanaged] {refName} → no match") + if "dotnetAssemblyRef" in metadata: + logger.debug( + f"[.NET][import] {software.UUID} importing {len(metadata['dotnetAssemblyRef'])} assemblies" + ) for asmRef in metadata["dotnetAssemblyRef"]: refName = None - refVersion = None + refVersion = None # This variable was declared in legacy but never used and is kept for potential future use refCulture = None if "Name" in asmRef: refName = asmRef["Name"] @@ -145,8 +442,13 @@ def establish_relationships( refCulture = asmRef["Culture"] if "Version" in asmRef: refVersion = asmRef["Version"] + logger.debug( + f"[.NET][import] resolving {refName} (version={refVersion}, culture={refCulture})" + ) + + fname_variants = [refName + ".dll"] - # check if codeBase element exists for this assembly in appconfig + # Check if codeBase element exists for this assembly in appconfig if dnDependentAssemblies is not None: for depAsm in dnDependentAssemblies: # dependent assembly object contains info on assembly id and binding redirects that with a better internal SBOM @@ -169,75 +471,151 @@ def establish_relationships( install_basepath = pathlib.PureWindowsPath( install_filepath ).parent.as_posix() - cb_filepath = pathlib.PureWindowsPath( + cb_fullpath = normalize_path( install_basepath, codebase_href ) - cb_file = cb_filepath.name - cb_path = [cb_filepath.parent.as_posix()] - for e in 
find_installed_software(sbom, cb_path, cb_file): - dependency_uuid = e.UUID + # 1) Graph-first: resolve via fs_tree + match = sbom.get_software_by_path( + cb_fullpath, case_insensitive=True + ) + if match and match.UUID != dependent_uuid: + logger.debug( + f"[.NET][codeBase] {codebase_href} → UUID={match.UUID} [graph]" + ) relationships.append( - Relationship( - dependent_uuid, - dependency_uuid, - "Uses", - ) + Relationship(dependent_uuid, match.UUID, "Uses") ) + else: + # 2) Legacy fallback: directory+filename scan (matches legacy behavior) + cb_filepath = pathlib.PureWindowsPath(cb_fullpath) + cb_file = cb_filepath.name + cb_path = [cb_filepath.parent.as_posix()] + + legacy_found = False + for e in find_installed_software( + sbom, cb_path, cb_file + ): + if e and e.UUID != dependent_uuid: + logger.debug( + f"[.NET][codeBase] {codebase_href} → UUID={e.UUID} [legacy_fallback]" + ) + relationships.append( + Relationship(dependent_uuid, e.UUID, "Uses") + ) + legacy_found = True + + if not legacy_found: + logger.debug( + f"[.NET][codeBase] {codebase_href} → no match" + ) + # --- Build probing dirs (legacy patterns + fs_tree) --- + # - base dir + # - base/refName + # - culture subdirs + # - privatePath combinations + # This reproduces legacy layout coverage, but we will still resolve + # through the fs_tree via sbom.get_software_by_path(). 
# continue on to probing even if codebase element was found, since we can't guarantee the assembly identity required by the codebase element # get the list of paths to probe based on locations software is installed, assembly culture, assembly name, and probing paths from appconfig file - probedirs = get_dotnet_probedirs(software, refCulture, refName, dnProbingPaths) - for e in find_installed_software(sbom, probedirs, refName + ".dll"): - dependency_uuid = e.UUID - relationships.append(Relationship(dependent_uuid, dependency_uuid, "Uses")) - # logging assemblies not found would be nice but is a lot of noise as it mostly just prints system/core .NET libraries - return relationships + probedirs: list[str] = [] + probedirs = get_dotnet_probedirs( + software=software, + refCulture=refCulture, + refName=refName, + dnProbingPaths=dnProbingPaths or None, + ) + logger.debug(f"[.NET][import] probing dirs for {refName}: {probedirs}") -def is_absolute_path(fname: str) -> bool: - givenpath = pathlib.PureWindowsPath(fname) - return givenpath.is_absolute() + matched_uuids = set() + used_method = {} + # def is_valid_match(sw: Software, refVersion=refVersion, refCulture=refCulture) -> bool: + # """ + # Apply identity-based filters to ensure that a candidate assembly + # truly corresponds to the referenced assembly. 
-# construct a list of directories to probe for establishing dotnet relationships -def get_dotnet_probedirs(software: Software, refCulture, refName, dnProbingPaths): - probedirs = [] - # probe for the referenced assemblies - if isinstance(software.installPath, Iterable): - for install_filepath in software.installPath: - install_basepath = pathlib.PureWindowsPath(install_filepath).parent.as_posix() - if refCulture is None or refCulture == "": - # [application base] / [assembly name].dll - # [application base] / [assembly name] / [assembly name].dll - probedirs.append(pathlib.PureWindowsPath(install_basepath).as_posix()) - probedirs.append(pathlib.PureWindowsPath(install_basepath, refName).as_posix()) - if dnProbingPaths is not None: - # add probing private paths - for path in dnProbingPaths: - # [application base] / [binpath] / [assembly name].dll - # [application base] / [binpath] / [assembly name] / [assembly name].dll - probedirs.append(pathlib.PureWindowsPath(install_basepath, path).as_posix()) - probedirs.append( - pathlib.PureWindowsPath(install_basepath, path, refName).as_posix() - ) - else: - # [application base] / [culture] / [assembly name].dll - # [application base] / [culture] / [assembly name] / [assembly name].dll - probedirs.append(pathlib.PureWindowsPath(install_basepath, refCulture).as_posix()) - probedirs.append( - pathlib.PureWindowsPath(install_basepath, refName, refCulture).as_posix() - ) - if dnProbingPaths is not None: - # add probing private paths - for path in dnProbingPaths: - # [application base] / [binpath] / [culture] / [assembly name].dll - # [application base] / [binpath] / [culture] / [assembly name] / [assembly name].dll - probedirs.append( - pathlib.PureWindowsPath(install_basepath, path, refCulture).as_posix() - ) - probedirs.append( - pathlib.PureWindowsPath( - install_basepath, path, refName, refCulture - ).as_posix() - ) - return probedirs + # A match is rejected when: + # • The candidate is the dependent software itself (avoid 
self-loops). + # • Version metadata exists on both sides and the versions differ. + # • Culture metadata exists on both sides and the cultures differ. + + # Only explicit mismatches are filtered out; if metadata is absent on + # either side, the function allows the match to proceed so that other + # phases may evaluate it. + # """ + # # Do not match the importing software to itself + # if sw.UUID == dependent_uuid: + # return False + + # # Check version and culture metadata when present + # for md in sw.metadata or []: + # asm = md.get("dotnetAssembly") + # if asm: + # sw_version = asm.get("Version") + # sw_culture = asm.get("Culture") + + # # Version mismatch + # if refVersion and sw_version and sw_version != refVersion: + # logger.debug( + # f"[.NET][filter] skipping {sw.UUID}: version {sw_version} ≠ {refVersion}" + # ) + # return False + + # # Culture mismatch + # if refCulture and sw_culture and sw_culture != refCulture: + # logger.debug( + # f"[.NET][filter] skipping {sw.UUID}: culture {sw_culture} ≠ {refCulture}" + # ) + # return False + # return True + + # Phase 1: fs_tree lookup + # + # Construct fully qualified candidate paths by combining each probing + # directory with each allowed filename variant. Each constructed path + # is resolved through sbom.get_software_by_path(), which uses the + # filesystem graph (fs_tree+ symlink edges) + # + # A match is accepted only when: + # • A software entry exists at the resolved path, and + # • It satisfies version and culture filters (is_valid_match). + # + # This phase provides the most precise form of resolution because it + # operates on concrete filesystem paths derived from .NET probing rules. 
+ for probe_dir in sorted(set(probedirs)): + for fname in fname_variants: + path = normalize_path(probe_dir, fname) + match = sbom.get_software_by_path(path, case_insensitive=True) + # ok = bool(match and is_valid_match(match)) + logger.debug( + f"[.NET][fs_tree] {path} → {'UUID=' + match.UUID if match else 'no match'}" + ) + if match and match.UUID != dependent_uuid: + matched_uuids.add(match.UUID) + used_method[match.UUID] = "fs_tree" + + # Phase 2: Legacy probe + if not matched_uuids: + for e in find_installed_software(sbom, probedirs, refName + ".dll"): + if e.UUID != dependent_uuid: + logger.debug(f"[.NET][legacy_phase2] {refName} → UUID={e.UUID}") + matched_uuids.add(e.UUID) + used_method[e.UUID] = "legacy_full_scan" + + # Phase 3: Finalize relationships + for uuid in matched_uuids: + rel = Relationship(dependent_uuid, uuid, "Uses") + if rel not in relationships: + method = used_method.get(uuid, "unknown") + logger.debug( + f"[.NET][final] {dependent_uuid} Uses {refName} → UUID={uuid} [{method}]" + ) + relationships.append(rel) + # logging assemblies not found would be nice but is a lot of noise as it mostly just prints system/core .NET libraries + + if not matched_uuids: + logger.debug(f"[.NET][final] {dependent_uuid} Uses {refName} → no match") + + return relationships diff --git a/surfactant/relationships/dotnet_relationship_legacy.py b/surfactant/relationships/dotnet_relationship_legacy.py new file mode 100644 index 000000000..40827320a --- /dev/null +++ b/surfactant/relationships/dotnet_relationship_legacy.py @@ -0,0 +1,243 @@ +# Copyright 2023 Lawrence Livermore National Security, LLC +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT +import pathlib +from collections.abc import Iterable +from typing import List, Optional + +import surfactant.plugin +from surfactant.sbomtypes import SBOM, Relationship, Software + +from ._internal.windows_utils import find_installed_software + + +def has_required_fields(metadata) -> bool: + # dotnetAssemblyRef must present, otherwise we have no info on .NET imports + return "dotnetAssemblyRef" in metadata + + +@surfactant.plugin.hookimpl +def establish_relationships( + sbom: SBOM, software: Software, metadata +) -> Optional[List[Relationship]]: + if not has_required_fields(metadata): + return None + + relationships: List[Relationship] = [] + dependent_uuid = software.UUID + dnName = None + dnCulture = None + dnVersion = None + if "dotnetAssembly" in metadata: + dnAssembly = metadata["dotnetAssembly"] + if "Name" in dnAssembly: + dnName = dnAssembly["Name"] + if "Culture" in dnAssembly: + dnCulture = dnAssembly["Culture"] + if "Version" in dnAssembly: + dnVersion = dnAssembly["Version"] + + # get additional probing paths if they exist + dnProbingPaths = None + dnDependentAssemblies = None + + windowsAppConfig = None + windowsManifest = None + if "appConfigFile" in metadata: + windowsAppConfig = metadata["appConfigFile"] + if "manifestFile" in metadata: + windowsManifest = metadata["manifestFile"] + + if windowsAppConfig: + if "runtime" in windowsAppConfig: + wac_runtime = windowsAppConfig["runtime"] + if "assemblyBinding" in wac_runtime: + wac_asmbinding = wac_runtime["assemblyBinding"] + if "dependentAssembly" in wac_asmbinding: + dnDependentAssemblies = wac_asmbinding["dependentAssembly"] + if "probing" in wac_asmbinding: + wac_probing = wac_asmbinding["probing"] + if "privatePath" in wac_probing: + wac_paths = wac_probing["privatePath"] + for path in wac_paths.split(";"): + if dnProbingPaths is None: + dnProbingPaths = [] + dnProbingPaths.append(pathlib.PureWindowsPath(path).as_posix()) + + # 
https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/loading-unmanaged + # 1. Check the active AssemblyLoadContext cache + # 2. Calling the import resolver set by the setDllImportResolver function + # - a. Example using SetDllImportResolver: https://learn.microsoft.com/en-us/dotnet/standard/native-interop/native-library-loading + # - b. Checks PInvoke's or Assembly's DefaultDllImportSearchPathsAttribute, then the assembly's directory, then LoadLibraryEx with LOAD_WITH_ALTERED_SEARCH_PATH flag + # - This attribute has no effect on non-Windows platforms / Mono runtime + # - https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.defaultdllimportsearchpathsattribute?view=net-7.0 + # - i. This has a "Paths" property which is a bitwise combination of paths specified in ii: + # - ii. https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.dllimportsearchpath?view=net-7.0 + # 3. The active AssemblyLoadContext calls its LoadUnmanagedDll function (Default behavior is the same as AssemblyRef probing?) + # - a. Can be overridden, but the default implementation returns IntPtr.Zero, which tells the runtime to load with its default policy. + # - b. https://learn.microsoft.com/en-us/dotnet/api/system.runtime.loader.assemblyloadcontext.loadunmanageddll?view=net-7.0 + # 4. Run default unmanaged library probing logic by parsing *.deps.json probing properties + # - a. If the json file isn't present, assume the calling assembly's directory contains the library + # - b. 
https://learn.microsoft.com/en-us/dotnet/core/dependency-loading/default-probing#unmanaged-native-library-probing + if "dotnetImplMap" in metadata: + for asmRef in metadata["dotnetImplMap"]: + if "Name" not in asmRef: + continue + refName = asmRef["Name"] + + # Check absolute path against entries in software + if is_absolute_path(refName): + ref_abspath = pathlib.PureWindowsPath(refName) + for e in sbom.software: + if e.installPath is None: + continue + if isinstance(e.installPath, Iterable): + for ifile in e.installPath: + if ref_abspath == pathlib.PureWindowsPath(ifile): + relationships.append(Relationship(dependent_uuid, e.UUID, "Uses")) + continue + + probedirs = [] + if isinstance(software.installPath, Iterable): + for ipath in software.installPath: + probedirs.append(pathlib.PureWindowsPath(ipath).parent.as_posix()) + # Construct a list of combinations specified in (2.a) + # Refer to Issue #80 - Need to verify that this conforms with cross-platform behavior + combinations = [refName] + if not (refName.endswith(".dll") or refName.endswith(".exe")): + combinations.append(f"{refName}.dll") + combinations.extend( + [ + f"{refName}.so", + f"{refName}.dylib", + f"lib{refName}.so", + f"lib{refName}.dylib", + f"lib{refName}", + ] + ) + # On Linux, if the libname ends with .so or has .so. then version variations are tried + # Refer to Issue #79 - Need regex matching for version variations + for e in find_installed_software(sbom, probedirs, combinations): + dependency_uuid = e.UUID + relationships.append(Relationship(dependent_uuid, dependency_uuid, "Uses")) + + # https://learn.microsoft.com/en-us/dotnet/framework/deployment/how-the-runtime-locates-assemblies + # 1. Determine correct assembly version using configuration files (binding redirects, code location, etc) + # 2. Check if assembly name bound before; if it is use previously loaded assembly + # 3. 
Check global assembly cache (%windir%\Microsoft.NET\assembly in .NET framework 4, %windir%\assembly previously) + # 4. Probe for assembly: + # - a. Check for element in app config; check the given location and if assembly found great no probing; otherwise fail without probing + # - b. If there is no element, begin probing using + # - application base + culture + assembly name directories + # - privatePath directories from a probing element, combined with culture/appbase/assemblyname (done before the standard probing directories) + # - the location of the calling assembly may be used as a hint for where to find the referenced assembly + if "dotnetAssemblyRef" in metadata: + for asmRef in metadata["dotnetAssemblyRef"]: + refName = None + refVersion = None + refCulture = None + if "Name" in asmRef: + refName = asmRef["Name"] + else: + continue # no name means we have no assembly to search for + if "Culture" in asmRef: + refCulture = asmRef["Culture"] + if "Version" in asmRef: + refVersion = asmRef["Version"] + + # check if codeBase element exists for this assembly in appconfig + if dnDependentAssemblies is not None: + for depAsm in dnDependentAssemblies: + # dependent assembly object contains info on assembly id and binding redirects that with a better internal SBOM + # representation could be used to also verify the right assembly is being found + if "codeBase" in depAsm: + if "href" in depAsm["codeBase"]: + codebase_href = depAsm["codeBase"]["href"] + # strong named assembly can be anywhere on intranet or Internet + if ( + codebase_href.startswith("http://") + or codebase_href.startswith("https://") + or codebase_href.startswith("file://") + ): + # codebase references a url; interesting for manual analysis/gathering additional files, but not supported by surfactant yet + pass + else: + # most likely a private assembly, so path must be relative to application's directory + if isinstance(software.installPath, Iterable): + for install_filepath in software.installPath: 
+ install_basepath = pathlib.PureWindowsPath( + install_filepath + ).parent.as_posix() + cb_filepath = pathlib.PureWindowsPath( + install_basepath, codebase_href + ) + cb_file = cb_filepath.name + cb_path = [cb_filepath.parent.as_posix()] + for e in find_installed_software(sbom, cb_path, cb_file): + dependency_uuid = e.UUID + relationships.append( + Relationship( + dependent_uuid, + dependency_uuid, + "Uses", + ) + ) + + # continue on to probing even if codebase element was found, since we can't guarantee the assembly identity required by the codebase element + # get the list of paths to probe based on locations software is installed, assembly culture, assembly name, and probing paths from appconfig file + probedirs = get_dotnet_probedirs(software, refCulture, refName, dnProbingPaths) + for e in find_installed_software(sbom, probedirs, refName + ".dll"): + dependency_uuid = e.UUID + relationships.append(Relationship(dependent_uuid, dependency_uuid, "Uses")) + # logging assemblies not found would be nice but is a lot of noise as it mostly just prints system/core .NET libraries + return relationships + + +def is_absolute_path(fname: str) -> bool: + givenpath = pathlib.PureWindowsPath(fname) + return givenpath.is_absolute() + + +# construct a list of directories to probe for establishing dotnet relationships +def get_dotnet_probedirs(software: Software, refCulture, refName, dnProbingPaths): + probedirs = [] + # probe for the referenced assemblies + if isinstance(software.installPath, Iterable): + for install_filepath in software.installPath: + install_basepath = pathlib.PureWindowsPath(install_filepath).parent.as_posix() + if refCulture is None or refCulture == "": + # [application base] / [assembly name].dll + # [application base] / [assembly name] / [assembly name].dll + probedirs.append(pathlib.PureWindowsPath(install_basepath).as_posix()) + probedirs.append(pathlib.PureWindowsPath(install_basepath, refName).as_posix()) + if dnProbingPaths is not None: + # add 
probing private paths + for path in dnProbingPaths: + # [application base] / [binpath] / [assembly name].dll + # [application base] / [binpath] / [assembly name] / [assembly name].dll + probedirs.append(pathlib.PureWindowsPath(install_basepath, path).as_posix()) + probedirs.append( + pathlib.PureWindowsPath(install_basepath, path, refName).as_posix() + ) + else: + # [application base] / [culture] / [assembly name].dll + # [application base] / [culture] / [assembly name] / [assembly name].dll + probedirs.append(pathlib.PureWindowsPath(install_basepath, refCulture).as_posix()) + probedirs.append( + pathlib.PureWindowsPath(install_basepath, refName, refCulture).as_posix() + ) + if dnProbingPaths is not None: + # add probing private paths + for path in dnProbingPaths: + # [application base] / [binpath] / [culture] / [assembly name].dll + # [application base] / [binpath] / [culture] / [assembly name] / [assembly name].dll + probedirs.append( + pathlib.PureWindowsPath(install_basepath, path, refCulture).as_posix() + ) + probedirs.append( + pathlib.PureWindowsPath( + install_basepath, path, refName, refCulture + ).as_posix() + ) + return probedirs diff --git a/surfactant/relationships/elf_relationship.py b/surfactant/relationships/elf_relationship.py index 74f069ce8..0a4231065 100644 --- a/surfactant/relationships/elf_relationship.py +++ b/surfactant/relationships/elf_relationship.py @@ -6,6 +6,8 @@ from collections.abc import Iterable from typing import List, Optional +from loguru import logger + import surfactant.plugin from surfactant.sbomtypes import SBOM, Relationship, Software @@ -59,41 +61,54 @@ def establish_relationships( sbom: SBOM, software: Software, metadata ) -> Optional[List[Relationship]]: """ - Establish relationships between a software item and its dependencies. + Establish `Uses` relationships between a software item and its ELF-declared + dependencies. 
+ + This function resolves dependencies listed in `metadata["elfDependencies"]` + by computing the set of possible runtime file paths for each dependency and + attempting to match them against software entries in the SBOM. Resolution + proceeds in two stages: + + 1. **fs_tree-based lookup** + Fully-qualified dependency paths (absolute or constructed from + runpaths, RPATH, RUNPATH, and $ORIGIN substitutions) are checked + against the SBOM filesystem graph. This lookup is symlink-aware. - This function processes metadata to identify software dependencies and their - corresponding relationships. It examines `metadata` for ELF dependencies, - determines possible file paths for the dependencies, and searches the SBOM - (Software Bill of Materials) for entries that match these dependencies. + 2. **Legacy installPath fallback** + If no fs_tree match is found, the function falls back to a strict + legacy rule: a candidate software entry must advertise the same + filename *and* must contain an exact installPath equal to one of the + computed dependency paths. + + No heuristic directory-level inference is performed. Dependencies are only + matched when the SBOM contains a resolvable path or an exact legacy + installPath entry. Args: - sbom (SBOM): The software bill of materials, containing data about the available software. - software (Software): The software entity for which relationships are being established. - metadata: Metadata providing details about the software dependencies. Must contain - the "elfDependencies" field to describe ELF-based dependencies. + sbom (SBOM): The software bill of materials containing all discovered + software entries and the filesystem graph. + software (Software): The software entry whose dependencies are being + resolved. + metadata: Metadata extracted from the ELF file. Must contain the + `"elfDependencies"` field to participate in relationship inference. 
Returns: - Optional[List[Relationship]]: A list of `Relationship` objects representing dependencies - between the specified software and other software items in the SBOM. If the required - fields in `metadata` are missing, `None` is returned. - - Raises: - None + Optional[List[Relationship]]: + A list of `Relationship(dependent, dependency, "Uses")` entries. + Returns `None` if the metadata does not contain `elfDependencies`. Notes: - - The function uses `metadata["elfDependencies"]` to locate dependencies described - as ELF paths or filenames. - - Relative paths in metadata are normalized and matched against installation paths - of the candidate software entries. - - Dependency file paths are cross-referenced with `sbom.software` entries to establish - their relationships. - - Returned `Relationship` objects are unique: no duplicates are added to the result list. + - Dependency paths may originate from: + • absolute paths in metadata + • relative paths joined against each installPath + • computed runpaths (RPATH/RUNPATH) and default library paths + - Duplicate relationships are suppressed. + - Self-dependencies are not emitted. Example: relationships = establish_relationships(sbom, software, metadata) - if relationships: - for relationship in relationships: - print(f"{relationship.dependent_uuid} uses {relationship.dependency_uuid}") + for rel in relationships or []: + print(f"{rel.xUUID} uses {rel.yUUID}") """ if not has_required_fields(metadata): return None @@ -101,51 +116,145 @@ def establish_relationships( relationships: List[Relationship] = [] dependent_uuid = software.UUID default_search_paths = generate_search_paths(software, metadata) + logger.debug(f"[ELF][search] default paths: {[p.as_posix() for p in default_search_paths]}") + + # Each entry in metadata["elfDependencies"] is a DT_NEEDED-style string + # extracted from the ELF dynamic section. These can be: + # • Bare filenames, e.g. "libc.so.6" + # • Relative paths, e.g. 
"subdir/libfoo.so" + # • Absolute paths, e.g. "/opt/lib/libbar.so" + # + # The dynamic loader interprets these differently, and the resolution logic + # below mirrors that behavior. We normalize each dependency into candidate + # full paths ("fpaths") that represent where the loader *could* reasonably + # locate the referenced library within the captured filesystem snapshot. + # + # Notes: + # - posix_normpath() removes "..", ".", and duplicate separators to keep + # path comparisons consistent with fs_tree key normalization. + # - No directory inference or fuzzy matching occurs: all candidate paths + # must be derived strictly from ELF semantics (slashes vs. bare names) + # and from the calling software’s installPath or runpath metadata. for dep in metadata["elfDependencies"]: - # if dependency has a slash, it is interpreted as a pathname to shared object to load - # construct fname and full file path(s) to search for; paths must be a list if the dependency is given as a relative path - if "/" in dep: - # search SBOM entries for a library at a matching relative/absolute path - dep = posix_normpath( - dep - ) # normpath takes care of redundancies such as `//`->`/` and `ab/../xy`->`xy`; NOTE may change meaning of path containing symlinks - fname = dep.name + dep_str = dep + fpaths = [] + dep = posix_normpath(dep_str) + fname = dep.name # e.g., 'libfoo.so' + + # Determine all candidate filesystem paths where this dependency *might* + # reside. ELF dependency strings come in two forms: + # + # Case 1: The string contains a slash (e.g., "somedir/libfoo.so") + # → The dynamic loader interprets this as a literal path. + # - If absolute, the path is used directly. + # - If relative, it is resolved relative to each installPath + # of the referencing software. This mirrors runtime behavior + # where a DT_NEEDED entry containing "subdir/libX.so" is + # interpreted relative to the binary's directory. 
+ # + # Case 2: Bare filename (e.g., "libfoo.so") + # → The loader searches a sequence of directories: + # runpaths, rpaths, LD_LIBRARY_PATH, cache entries, + # and system defaults. Here we approximate that search by + # joining the filename onto the generated search paths + # (generate_search_paths), which already expands RPATH, + # RUNPATH, $ORIGIN, and default library paths unless disabled. + # + # Together, these two branches generate the list `fpaths`, representing + # all plausible full paths that could correspond to this dependency. + # Later phases attempt to match these paths against the SBOM via fs_tree + # or exact installPath equality. + + # Case 1: Dependency has slash — treat as direct path + if "/" in dep_str: if dep.is_absolute(): - # absolute path - fpaths = [str(dep)] + fpaths = [dep.as_posix()] else: - # relative path - fpaths = [] - # iterate through install paths for sw to get the full path to the file as it would appear in installPaths for the software entry if isinstance(software.installPath, Iterable): for ipath in software.installPath: - ipath_posix = posix_normpath( - ipath - ) # NOTE symlinks in install path may be affected by normpath - fpaths.append( - posix_normpath(str(ipath_posix.parent.joinpath(dep))).as_posix() - ) # paths to search are install path folders + relative path of dependency + ipath_posix = posix_normpath(ipath) + combined = posix_normpath(str(ipath_posix.parent.joinpath(dep))).as_posix() + fpaths.append(combined) + + # Case 2: Bare filename — use runpaths and fallback paths + else: + fpaths = [p.joinpath(fname).as_posix() for p in default_search_paths] + + # Phase 1: fs_tree lookup + # + # Attempt to resolve each dependency path directly against the SBOM's + # filesystem graph. This is the most accurate resolution method and + # mirrors how the file would be found at runtime: + # + # - Paths are normalized and checked for a software UUID attached to + # the corresponding fs_tree node. 
+ # - If the exact node is not a software installPath, the resolver + # follows recorded filesystem symlinks (file- and directory-level) + # using BFS until a software entry is reached. + # + # This phase yields only concrete, evidence-based matches—no inference. + # A match is accepted only if: + # - The target resolves through the fs_tree to a software entry, AND + # - The dependency is not self-referential. + matched_uuids = set() + used_method = {} + + for path in fpaths: + match = sbom.get_software_by_path(path) + ok = bool(match and match.UUID != software.UUID) + logger.debug(f"[ELF][fs_tree] {path} → {'UUID=' + match.UUID if ok else 'no match'}") + if ok: + matched_uuids.add(match.UUID) + used_method[match.UUID] = "fs_tree" + + # Phase 2: Legacy installPath fallback + # + # This stage preserves the historical (pre-fs_tree) resolution behavior: + # a dependency is considered matched only if: + # - the candidate software entry advertises the same fileName, AND + # - one of its installPath entries exactly matches one of the fully + # constructed dependency paths in `fpaths`. + # + # Unlike the fs_tree lookup, this method does NOT resolve symlinks, + # It requires a literal, explicit installPath match. This ensures: + # - No directory-level inference (removed with Phase 3), + # - No fallback guesses when the SBOM lacks a concrete path, + # - Behavior consistent with the older relationship engine. + # + # This phase only runs if fs_tree matching failed. 
+ if not matched_uuids: + # Look for a software entry with a file name and install path that matches the dependency that would be loaded + for item in sbom.software: + # Check if the software entry has a name matching the dependency first as a quick check to rule out non-matches + if isinstance(item.fileName, Iterable) and fname not in item.fileName: + continue + + # Check for exact installPath equivalence with any computed dep path + for fp in fpaths: + if isinstance(item.installPath, Iterable) and fp in (item.installPath or []): + # software matching requirements to be the loaded dependency was found + if item.UUID != software.UUID: + dependency_uuid = item.UUID + logger.debug(f"[ELF][legacy] {fname} in {fp} → UUID={item.UUID}") + matched_uuids.add(dependency_uuid) + used_method[dependency_uuid] = "legacy_installPath" + + # Emit final relationships + if matched_uuids: + for dependency_uuid in matched_uuids: + if dependency_uuid == software.UUID: + continue + rel = Relationship(dependent_uuid, dependency_uuid, "Uses") + if rel not in relationships: + relationships.append(rel) + method = used_method.get(dependency_uuid, "unknown") + logger.debug( + f"[ELF][final] {dependent_uuid} Uses {fname} → UUID={dependency_uuid} [{method}]" + ) else: - fname = dep - # the paths for the dependency follow the default search path order for Linux/FreeBSD/etc - fpaths = [ - p.joinpath(fname).as_posix() for p in default_search_paths - ] # append fname to the end of the paths to get the full file install paths of the dependency - - # Look for a software entry with a file name and install path that matches the dependency that would be loaded - for item in sbom.software: - # Check if the software entry has a name matching the dependency first as a quick check to rule out non-matches - if isinstance(item.fileName, Iterable) and fname not in item.fileName: - continue - - # check if the software entry is installed to one of the paths looked at for loading the dependency - for fp in 
fpaths: - if isinstance(item.installPath, Iterable) and fp in item.installPath: - # software matching requirements to be the loaded dependency was found - dependency_uuid = item.UUID - rel = Relationship(dependent_uuid, dependency_uuid, "Uses") - if rel not in relationships: - relationships.append(rel) + logger.debug(f"[ELF][final] {dependent_uuid} Uses {fname} → no match") + + logger.debug(f"[ELF][final] emitted {len(relationships)} relationships") return relationships @@ -180,76 +289,27 @@ def generate_search_paths(sw: Software, md) -> List[pathlib.PurePosixPath]: # 1. Search using directories in DT_RPATH if present and no DT_RUNPATH exists (use of DT_RPATH is deprecated) # 2. Use LD_LIBRARY_PATH environment variable; ignore if suid/sgid binary (nothing to do, we don't have this information w/o running on a live system) # 3. Search using directories in DT_RUNPATH if present - paths = generate_runpaths(sw, md) # will return an empty list if none + paths = generate_runpaths(sw, md) # May include $ORIGIN etc., already substituted + # Check for the DF_1_NODEFLIB dynamic flag: disables default library search # 4. From /etc/ld.so.cache (/var/run/ld.so.hints on FreeBSD) list of compiled candidate libraries previously found in augmented library path; if binary was linked with -z nodeflib linker option, libraries in default library paths are skipped # /etc/ld.so.conf can be used to add additional directories to defaults (e.g. 
/usr/local/lib or /opt/lib), but we don't necessarily have a way to gather this info # Search in default path /lib, then /usr/lib; skip if binary was linked with -z nodeflib option nodeflib = False - if "elfDynamicFlags1" in md: - if "DF_1_NODEFLIB" in md["elfDynamicFlags1"]: - nodeflib = md["elfDynamicFlags1"]["DF_1_NODEFLIB"] + if "elfDynamicFlags1" in md and "DF_1_NODEFLIB" in md["elfDynamicFlags1"]: + nodeflib = md["elfDynamicFlags1"]["DF_1_NODEFLIB"] + + # If DF_1_NODEFLIB is not set, include default system paths if not nodeflib: - # add default search paths - paths.extend( - [pathlib.PurePosixPath(p) for p in ["/lib", "/lib64", "/usr/lib", "/usr/lib64"]] - ) + defaults = ["/lib", "/lib64", "/usr/lib", "/usr/lib64"] + logger.debug(f"[ELF][runpath] DF_1_NODEFLIB not set; adding defaults: {defaults}") + paths.extend([pathlib.PurePosixPath(p) for p in defaults]) - return paths + # Ensure all entries are PurePosixPath objects (in case runpaths included strings) + return [p if isinstance(p, pathlib.PurePosixPath) else pathlib.PurePosixPath(p) for p in paths] def generate_runpaths(sw: Software, md) -> List[pathlib.PurePosixPath]: - """ - Generate a list of resolved runpaths based on the metadata from - an ELF file and the provided software object. - - This function determines the appropriate runpath entries by analyzing - DT_RPATH and DT_RUNPATH from ELF metadata (`md`) and substitutes - dynamic string tokens (DSTs) to produce formatted paths. - - The logic follows these rules: - 1. If `elfRpath` is present in the metadata and `elfRunpath` is not, - the function uses `elfRpath` as the source of runpaths. Note that - the use of DT_RPATH is deprecated. - 2. If `elfRunpath` exists, it takes precedence and the function uses - `elfRunpath` as the source of runpath. - 3. Paths are split using `:` as a separator, and empty path components - are ignored. - 4. All paths perform DST substitution using the - `substitute_all_dst()` function. 
def substitute_all_dst(sw: Software, md, path) -> List[pathlib.PurePosixPath]:
    """
    Expand dynamic string tokens (DSTs) such as $ORIGIN and $LIB in an ELF search path.

    Background and References:
    --------------------------
    The dynamic linker (`ld.so`) performs these substitutions in several contexts:
      - Environment variables: LD_LIBRARY_PATH, LD_PRELOAD, and LD_AUDIT
      - Dynamic section tags: DT_NEEDED, DT_RPATH, DT_RUNPATH, DT_AUDIT, and DT_DEPAUDIT
      - Arguments to ld.so: --audit, --library-path, and --preload
      - Filename arguments to dlopen() and dlmopen()

    More details: see the "Dynamic string tokens" section of
    https://man7.org/linux/man-pages/man8/ld.so.8.html

    Token behavior summary:
      $ORIGIN / ${ORIGIN}:
          Replaced with the directory containing the program or shared object.
          Here that is the parent directory of every entry in `sw.installPath`,
          so one input path can expand to several output paths.
          (For SUID/SGID binaries the real loader additionally requires the
          expanded path to lie in a trusted directory; that is not modeled.)
          References:
            - https://github.com/bminor/glibc/blob/0d41182/elf/dl-load.c#L356-L357
            - https://github.com/bminor/glibc/blob/0d41182/elf/dl-load.c#L297-L316

      $LIB / ${LIB}:
          Expands to "lib" or "lib64" depending on architecture. The architecture
          is not consulted here, so both variants are produced.

      $PLATFORM / ${PLATFORM}:
          Expands to the CPU type string (e.g. "x86_64"). Enumerating the possible
          values is target-specific and the token is rarely used, so any path
          containing it is discarded.

    Parameters:
        sw (Software): The software object; its `installPath` drives $ORIGIN expansion.
        md (dict): ELF metadata (currently unused; kept for interface stability).
        path (str): The raw search path to process.

    Returns:
        List[pathlib.PurePosixPath]: Normalized search paths.
          - No tokens present: a single-element list with the normalized input path.
          - $PLATFORM present: an empty list.
          - $ORIGIN present but `sw.installPath` provides no entries: an empty list,
            because the origin cannot be resolved (even if $LIB is also present).
          - Otherwise: one path per install path and, when $LIB is present,
            per lib/lib64 variant.
    """
    has_origin = "$ORIGIN" in path or "${ORIGIN}" in path
    has_lib = "$LIB" in path or "${LIB}" in path
    has_platform = "$PLATFORM" in path or "${PLATFORM}" in path

    # PLATFORM cannot be resolved reliably (varies by CPU/platform); drop the path.
    if has_platform:
        return []

    # No tokens: keep the path as-is (normalized).
    if not (has_origin or has_lib):
        return [posix_normpath(path)]

    # Start from the raw path; each expansion step rewrites the candidate list.
    candidates: List[str] = [path]

    if has_origin:
        install_paths = sw.installPath if isinstance(sw.installPath, Iterable) else []
        # With no install path the origin is unknown. The list becomes empty so that
        # the $LIB step below cannot emit paths still containing a literal $ORIGIN.
        candidates = [
            replace_dst(path, "ORIGIN", pathlib.PurePosixPath(ipath).parent.as_posix())
            for ipath in install_paths
        ]

    if has_lib:
        # Architecture is not consulted, so branch into both possible directories.
        candidates = [
            replace_dst(candidate, "LIB", libdir)
            for candidate in candidates
            for libdir in ("lib", "lib64")
        ]

    # Normalize after expansion to remove ../, ./ and // occurrences.
    return [posix_normpath(pathlib.PurePosixPath(c).as_posix()) for c in candidates]
@surfactant.plugin.hookimpl
def establish_relationships(
    sbom: SBOM, software: Software, metadata
) -> Optional[List[Relationship]]:
    """
    SurfActant plugin: establish 'Uses' relationships for Java class-level imports.

    Every class name imported by `software` is looked up in the export map built
    from the SBOM (exported class name → supplier UUID). A relationship is emitted
    when a supplier exists and is not `software` itself.

    Relationships are emitted in first-seen import order, so the result is
    reproducible between runs.

    Args:
        sbom (SBOM): The SBOM object containing all software entries.
        software (Software): The software entry declaring Java class dependencies.
        metadata (dict): Metadata containing 'javaClasses' with import/export info.

    Returns:
        Optional[List[Relationship]]: De-duplicated `Uses` relationships, or None
        when `metadata` carries no 'javaClasses' information.
    """
    if not has_required_fields(metadata):
        logger.debug(f"[Java][skip] No javaClasses metadata for UUID={software.UUID}")
        return None

    # The export map is rebuilt from `sbom` on every call so that no state leaks
    # between SBOMs (or tests).
    _ExportDict.create_export_dict(sbom)

    relationships: List[Relationship] = []
    dependent_uuid = software.UUID

    # dict.fromkeys() de-duplicates while preserving first-seen order; a set would
    # make the order of emitted relationships depend on string hash randomization.
    imports = dict.fromkeys(
        imported
        for class_info in metadata["javaClasses"].values()
        for imported in class_info.get("javaImports", [])
    )
    logger.debug(f"[Java][import] {software.UUID} importing {len(imports)} classes")

    for import_class in imports:
        class_path = class_to_path(import_class)
        logger.debug(f"[Java][import] resolving {import_class} → {class_path}")

        # TODO: fs_tree/path-based resolution (installPath parent + class_path looked
        # up through sbom.get_software_by_path) is not enabled yet; only the export
        # map is consulted.
        supplier_uuid = _ExportDict.get_supplier(import_class)
        if not supplier_uuid or supplier_uuid == dependent_uuid:
            # Unknown supplier, or the class is provided by the importer itself.
            logger.debug(f"[Java][final] {dependent_uuid} Uses {import_class} → no match")
            continue

        rel = Relationship(dependent_uuid, supplier_uuid, "Uses")
        if rel not in relationships:
            logger.debug(
                f"[Java][final] {dependent_uuid} Uses {import_class} → UUID={supplier_uuid} [legacy_exports]"
            )
            relationships.append(rel)

    logger.debug(f"[Java][final] emitted {len(relationships)} relationships")
    return relationships
def has_required_fields(metadata: dict[str, Any]) -> bool:
    """Return True if any known PE import field is present in the metadata."""
    return any(k in metadata for k in ("peImport", "peBoundImport", "peDelayImport"))


@surfactant.plugin.hookimpl
def establish_relationships(
    sbom: SBOM, software: Software, metadata: dict
) -> Optional[List[Relationship]]:
    """
    SurfActant plugin: establish 'Uses' relationships based on PE import metadata.

    Handles peImport, peBoundImport, and peDelayImport; each list of DLL names is
    resolved by get_windows_pe_dependencies(), which works in two phases:
      1. [fs_tree] Exact path match via sbom.get_software_by_path()
      2. [legacy]  installPath + fileName matching

    A DLL listed in more than one import table yields a single relationship.

    Returns:
        Optional[List[Relationship]]: `Uses` relationships, or None when the
        metadata contains none of the PE import fields.
    """
    if not has_required_fields(metadata):
        logger.debug(f"[PE][skip] No PE import metadata for UUID={software.UUID} ({software.name})")
        return None

    relationships: List[Relationship] = []
    field_map = {
        "peImport": "Direct",
        "peBoundImport": "Bound",
        "peDelayImport": "Delay",
    }

    for field, label in field_map.items():
        if field in metadata:
            entries = metadata[field] or []
            logger.debug(
                f"[PE][import] {label} imports for {software.name} ({software.UUID}): {len(entries)}"
            )
            # The same DLL may appear in several import tables; keep one relationship.
            for rel in get_windows_pe_dependencies(sbom, software, entries):
                if rel not in relationships:
                    relationships.append(rel)

    logger.debug(f"[PE][final] emitted {len(relationships)} relationships")
    return relationships


def get_windows_pe_dependencies(sbom: SBOM, sw: Software, peImports) -> List[Relationship]:
    """
    Resolve PE (Windows) DLL dependencies and generate ``Uses`` relationships.

    Resolution is attempted in two phases for every imported DLL name:

    1. **Primary: direct path resolution via ``sbom.get_software_by_path()``**
       The DLL name is appended to the parent directory of each of the importing
       file's install paths and looked up case-insensitively (Windows semantics).

    2. **Secondary: legacy string-based resolution** (only if phase 1 found nothing)
       A software entry matches when one of its ``fileName`` values equals the DLL
       name and one of its ``installPath`` entries has that basename and lies
       directly in a probed directory. Both comparisons are case-insensitive.

    Background and References
    -------------------------
    This approximates the statically resolvable subset of the Windows loader's DLL
    search order used by ``LoadLibrary`` / ``LoadLibraryEx``:
      - Dynamic-link library search order:
        https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order
      - DLL redirection:
        https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-redirection
      - API sets overview:
        https://learn.microsoft.com/en-us/windows/win32/apiindex/windows-apisets

    Only the "directory the application was loaded from" step is modeled. Not
    modeled (requires runtime state): in-memory module reuse, KnownDLLs registry
    lookup, API set resolution (API set names such as
    ``api-ms-win-core-file-l1-1-0.dll`` are logical contracts, not files on disk),
    manifests / SxS isolation, SafeDllSearchMode, LOAD_LIBRARY_SEARCH flags,
    SetDllDirectory/AddDllDirectory, system directories and ``PATH``.

    Notes
    -----
    - If ``installPath`` is missing for the importing software, resolution is
      skipped; such files are probably temporary files or installer artifacts.
    - TODO: resolve DLLs using relative locations inside ``containerPath`` for
      files originating from the same container UUID.
    - Unresolved DLLs are only logged at debug level: they are almost exclusively
      Windows system libraries that applications do not bundle.
    - Relationships are emitted in discovery order, so output is reproducible.

    Args:
        sbom (SBOM): The SBOM containing software entries and the populated fs_tree.
        sw (Software): The importing software item declaring DLL dependencies.
        peImports (list[str]): Base names of imported DLLs (e.g. ``['KERNEL32.dll']``).
            ``None`` is treated as an empty list; empty names are ignored.

    Returns:
        List[Relationship]: De-duplicated ``Uses`` relationships from ``sw.UUID``
        to each resolved dependency UUID.
    """
    relationships: List[Relationship] = []

    # No installPath is probably temporary files/installer
    if sw.installPath is None:
        logger.debug(f"[PE][skip] No installPath for {sw.name} ({sw.UUID}); skipping resolution")
        return relationships

    dependent_uuid = sw.UUID

    # The probe directories depend only on the importing file, so compute them once
    # instead of once per imported DLL. Order is kept; duplicates are dropped.
    probedirs: List[str] = []
    if isinstance(sw.installPath, Iterable):
        for ipath in sw.installPath:
            parent_dir = pathlib.PureWindowsPath(ipath).parent.as_posix()
            if parent_dir not in probedirs:
                probedirs.append(parent_dir)
    # Case-folded view for the legacy phase: Windows paths are case-insensitive.
    probedirs_ci = {d.casefold() for d in probedirs}

    for fname in peImports or []:
        if not fname:
            continue

        logger.debug(f"[PE][import] resolving '{fname}' for UUID={dependent_uuid}")
        logger.debug(f"[PE][import] probedirs for '{fname}': {probedirs}")

        # UUID -> resolution method. A dict keeps insertion order, which makes the
        # order of emitted relationships deterministic (a set would not).
        matches: dict[str, str] = {}

        # -----------------------------------
        # Phase 1: Direct fs_tree resolution
        # -----------------------------------
        for directory in probedirs:
            full_path = normalize_path(directory, fname)
            match = sbom.get_software_by_path(
                full_path,
                case_insensitive=True,  # Windows DLL resolution should be case-insensitive
            )
            ok = bool(match and match.UUID != dependent_uuid)
            logger.debug(
                f"[PE][fs_tree] {full_path} → {'UUID=' + match.UUID if ok else 'no match'}"
            )
            if ok:
                matches[match.UUID] = "fs_tree"

        # ----------------------------------------
        # Phase 2: Legacy installPath + fileName
        # Only runs if Phase 1 found no matches.
        # ----------------------------------------
        if not matches:
            fname_ci = fname.casefold()

            for item in sbom.software:
                # Self-references are never dependencies.
                if item.UUID == dependent_uuid:
                    continue

                # 1) Name match (case-insensitive) on fileName[]
                if not isinstance(item.fileName, Iterable):
                    continue
                names_ci = {n.casefold() for n in (item.fileName or []) if isinstance(n, str)}
                if fname_ci not in names_ci:
                    continue  # software does not claim this DLL name

                # 2) Directory + basename match (case-insensitive) on installPath entries
                if isinstance(item.installPath, Iterable):
                    for ipath in item.installPath or []:
                        win_path = pathlib.PureWindowsPath(ipath)
                        ip_dir_ci = win_path.parent.as_posix().casefold()
                        if ip_dir_ci in probedirs_ci and win_path.name.casefold() == fname_ci:
                            logger.debug(f"[PE][legacy] {fname} in {ipath} → UUID={item.UUID}")
                            matches[item.UUID] = "legacy_installPath"
                            break  # one matching install path is enough for this item

        # ----------------------------------------
        # Emit final relationships (if any found)
        # ----------------------------------------
        if not matches:
            logger.debug(f"[PE][final] {dependent_uuid} Uses {fname} → no match")
            continue

        for uuid, method in matches.items():
            rel = Relationship(dependent_uuid, uuid, "Uses")
            if rel not in relationships:
                logger.debug(f"[PE][final] {dependent_uuid} Uses {fname} → UUID={uuid} [{method}]")
                relationships.append(rel)

    return relationships
default_factory=list, + init=False, + repr=False, + metadata=config(exclude=lambda _: True), + ) + + # Deferred file-level symlinks (link_path, target_path, subtype) + # Queues symlink edges discovered before target nodes exist in fs_tree. + # Flushed later by expand_pending_file_symlinks() to ensure no links are lost. + _pending_file_links: List[Tuple[str, str, Optional[str]]] = field( + default_factory=list, + init=False, + repr=False, + metadata=config(exclude=lambda _: True), + ) + def __post_init__(self): # If called like SBOM(raw_dict), raw_dict will be in .systems if isinstance(self.systems, dict) and not self.hardware and not self.software: @@ -127,10 +156,254 @@ def __post_init__(self): k: v for k, v in self.__dataclass_fields__.items() if k not in INTERNAL_FIELDS } - # Build the NetworkX graph from systems/software and loaded relationships - self.build_graph() + # Build the Relationship graph from systems/software and loaded relationships + self.build_rel_graph() + + # Initialize fs_tree + self.fs_tree = nx.DiGraph() + + # Populate from installPaths (if present) + for sw in self.software: + self._add_software_to_fs_tree(sw) + + def _add_software_to_fs_tree(self, sw: "Software") -> None: + """ + Adds the install paths of a Software object to the SBOM's filesystem tree (fs_tree). + + This method normalizes each install path to POSIX format, constructs parent-child + directory edges, and attaches the software UUID as a node attribute at the final path. + + Args: + sw (Software): The software object whose install paths are to be added. + + Side Effects: + Modifies self.fs_tree (a NetworkX DiGraph) by: + - Creating parent-child edges for each path segment. + - Ensuring the full install path node exists. + - Tagging the final node with the software's UUID. 
+ + Example: + For installPath = ["C:\\app\\bin"], this will create: + - Nodes: "C:", "C:/app", "C:/app/bin" + - Edges: "C:" → "C:/app", "C:/app" → "C:/app/bin" + - Node "C:/app/bin" will have attribute {"software_uuid": sw.UUID} + """ + if not sw.installPath: + return # Nothing to add if no install paths + + for path in sw.installPath: + # Normalize Windows or Unix paths to a consistent POSIX string + norm_path = normalize_path(path) + parts = pathlib.PurePosixPath(norm_path).parts + + # Build parent-child relationships for all intermediate directories + for i in range(1, len(parts)): + parent = normalize_path(*parts[:i]) + child = normalize_path(*parts[: i + 1]) + self.fs_tree.add_edge(parent, child) + + # Ensure the final node exists before assigning attributes + if not self.fs_tree.has_node(norm_path): + self.fs_tree.add_node(norm_path) + + # Associate this path node with the software UUID + self.fs_tree.nodes[norm_path]["software_uuid"] = sw.UUID + + # wire the file → hash edge so hash-equivalence works + if sw.sha256: + try: + self.record_hash_node(norm_path, sw.sha256) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning(f"[fs_tree] Failed to attach hash edge for {norm_path}: {e}") + + def get_software_by_path( + self, + path: str, + *, + case_insensitive: bool = False, + ) -> Optional[Software]: + """ + Retrieve a Software entry by normalized install path, using the fs_tree (with optional + case-insensitive fallback and symlink traversal). + + This function normalizes the provided path into POSIX format and first attempts an exact, + case-sensitive lookup in the filesystem graph (`fs_tree`). If the lookup succeeds and the + node contains a ``software_uuid``, the corresponding Software entry is returned. 
+ + If the exact match fails and ``case_insensitive`` is enabled, a Windows-style + case-insensitive resolution is attempted: the function identifies the parent directory + of the requested path and scans its child nodes for a basename match using + case-folding. This preserves strict behavior for Unix/ELF callers while allowing PE + relationship resolvers to emulate Windows filesystem case-insensitivity. + + If neither the exact lookup nor the optional case-insensitive fallback finds a match, + the function performs a breadth-first traversal along outgoing symlink edges + (``type="symlink"``) beginning at the normalized path. The traversal continues until a + node containing a ``software_uuid`` is encountered or a depth limit is reached. - def build_graph(self) -> None: + Args: + path (str): + Raw input path (Windows or POSIX). Path separators are normalized but casing is + preserved. + case_insensitive (bool): + Enable Windows-style case-insensitive fallback matching when the exact lookup + fails. Defaults to ``False`` so that ELF/Unix resolution remains strictly + case-sensitive. + + Returns: + Optional[Software]: + The resolved Software entry if any resolution method succeeds, otherwise ``None``. + + Behavior Summary: + - Normalizes path separators to POSIX style (case preserved). + - Performs an exact, case-sensitive fs_tree lookup. + - Optionally performs case-insensitive basename matching within the parent directory + (Windows-specific behavior; opt-in only). + - Traverses outgoing symlink edges breadth-first, with cycle prevention. + - Applies a conservative depth cap to avoid pathological traversal. + - Returns the first reachable node containing ``software_uuid``. 
+ """ + # Normalize the input path to POSIX format to match internal fs_tree representation + norm_path = normalize_path(path) + + # Attempt direct node lookup (case-sensitive, fast path) + node = self.fs_tree.nodes.get(norm_path) + if node and "software_uuid" in node: + return self._find_software_entry(uuid=node["software_uuid"]) + + # Optional Windows-style, case-insensitive fallback + # Only for callers that explicitly opt-in, to avoid changing ELF/Unix semantics. + if case_insensitive: + target = pathlib.PurePosixPath(norm_path) + parent = target.parent.as_posix() + basename_ci = target.name.casefold() + + if self.fs_tree.has_node(parent): + for _src, child, _data in self.fs_tree.out_edges(parent, data=True): + child_name_ci = pathlib.PurePosixPath(child).name.casefold() + if child_name_ci == basename_ci: + candidate = self.fs_tree.nodes.get(child) + if candidate and "software_uuid" in candidate: + logger.debug( + "[fs_tree] case-insensitive match: %s → %s", + norm_path, + child, + ) + return self._find_software_entry(uuid=candidate["software_uuid"]) + + # Attempt to resolve via symlink traversal (BFS) with a depth cap + visited = set() + queue = deque([(norm_path, 0)]) # (node, depth) + MAX_SYMLINK_STEPS = 1000 # conservative cap; adjust if needed + + while queue: + current, depth = queue.popleft() + + if current in visited: + continue + visited.add(current) + + # Depth cap guard + if depth > MAX_SYMLINK_STEPS: + logger.warning( + "[fs_tree] Aborting symlink traversal for %s after %d steps", + path, + MAX_SYMLINK_STEPS, + ) + break + + # If the node doesn't exist in the graph, there are no edges to follow + if not self.fs_tree.has_node(current): + continue + + # Check each symlink edge from current node + for _, target, attrs in self.fs_tree.out_edges(current, data=True): + if attrs.get("type") == "symlink": + target_node = self.fs_tree.nodes.get(target, {}) + if "software_uuid" in target_node: + logger.debug(f"[fs_tree] Resolved {path} via symlink: {current} 
→ {target}") + return self._find_software_entry(uuid=target_node["software_uuid"]) + if target not in visited: + queue.append((target, depth + 1)) + + # No match found after traversal + return None + + def get_symlink_sources_for_path(self, path: str) -> List[str]: + """ + Retrieve all symlink paths (direct and transitive) that point to the given target path. + + This function performs a *reverse traversal* of the filesystem graph (`fs_tree`), + starting from the specified `path` and walking **incoming** symlink edges + (`link → target`) to collect every symlink node that ultimately resolves to + that target. + + The method is effectively the inverse of :meth:`get_software_by_path`, which + walks *outgoing* symlink edges to resolve a symlink to its destination. + + Behavior: + - Follows only edges with attribute ``type="symlink"``. + - Traverses breadth-first to handle multi-hop symlink chains + (e.g., A → B → C → /usr/bin/ls). + - Avoids cycles and repeated nodes using a visited set. + - Returns all normalized symlink paths that resolve to the given target path. + - Logs debug information for each edge visited and for each discovered source. + + Args: + path (str): The target filesystem path (can be POSIX or Windows-style). + + Returns: + List[str]: A sorted list of all symlink paths (direct or transitive) + that point to the provided target path. + Empty if none are found. 
+ + Example: + Given: + /usr/bin/dirE/link_to_F → /usr/bin/dirF + /usr/bin/dirF/runthat → /usr/bin/echo + + Then: + get_symlink_sources_for_path("/usr/bin/echo") + → ["/usr/bin/dirE/link_to_F/runthat", "/usr/bin/dirF/runthat"] + """ + norm_target = normalize_path(path) + if not self.fs_tree.has_node(norm_target): + logger.debug(f"[fs_tree] Target path not found in graph: {norm_target}") + return [] + + results: Set[str] = set() + visited: Set[str] = set() + queue: deque[str] = deque([norm_target]) + + logger.debug(f"[fs_tree] Starting reverse symlink traversal from: {norm_target}") + + while queue: + current = queue.popleft() + if current in visited: + continue + visited.add(current) + + # Iterate over all incoming symlink edges: src → current + for src, _dst, attrs in self.fs_tree.in_edges(current, data=True): + if attrs.get("type") != "symlink": + continue + + if src not in results: + results.add(src) + logger.debug(f"[fs_tree] Found symlink source: {src} → {current}") + + # Continue traversal upward through the graph (transitive links) + if src not in visited: + queue.append(src) + + logger.debug( + f"[fs_tree] Reverse symlink traversal complete for {norm_target}: " + f"{len(results)} sources found." + ) + + return sorted(results) + + def build_rel_graph(self) -> None: """Rebuild the directed graph from systems, software, and any loaded relationships.""" self.graph = nx.MultiDiGraph() for sys in self.systems: @@ -216,51 +489,585 @@ def add_software(self, sw: Software) -> None: if not self.graph.has_node(sw.UUID): self.graph.add_node(sw.UUID, type="Software") + self._add_software_to_fs_tree(sw) + + def _add_symlink_edge( + self, src: str, dst: str, *, subtype: Optional[str] = None, log_prefix: str = "[fs_tree]" + ) -> None: + """ + Internal helper to safely add a symlink edge to both fs_tree and graph. + + Ensures: + - Both nodes exist in fs_tree, and in graph with type="Path" when no type is already set. + - The fs_tree edge is labeled type="symlink" with optional subtype. 
+ - The logical graph mirrors the same edge with key="symlink". + - Duplicate edges are ignored gracefully. + + Args: + src (str): Path of the symlink source. + dst (str): Target path of the symlink. + subtype (Optional[str]): File or directory indicator. + log_prefix (str): Optional prefix for debug logs. + """ + # Add edge to fs_tree if not present + if not self.fs_tree.has_edge(src, dst): + self.fs_tree.add_edge(src, dst, type="symlink", subtype=subtype) + logger.debug(f"{log_prefix} Added symlink edge: {src} → {dst} [subtype={subtype}]") + + # Ensure both nodes exist and are typed as Path in the logical graph + for node in (src, dst): + if not self.graph.has_node(node): + self.graph.add_node(node, type="Path") + elif "type" not in self.graph.nodes[node]: + self.graph.nodes[node]["type"] = "Path" + + # Add mirrored edge in graph if not already present + if not self.graph.has_edge(src, dst, key="symlink"): + self.graph.add_edge(src, dst, key="symlink") + logger.debug(f"[graph] Added symlink edge: {src} → {dst}") + + def _record_symlink( + self, link_path: str, target_path: str, *, subtype: Optional[str] = None + ) -> None: + """ + Record a filesystem symlink in both the SBOM's relationship graph and its fs_tree. + + This method adds the given symlink as a relationship between two filesystem + path nodes (`link_path` → `target_path`). It ensures the link exists in both + the logical relationship graph (`graph`) and the physical filesystem graph (`fs_tree`), + maintaining internal consistency between them. + + Steps: + 1. Normalize both input paths to POSIX format. + 2. If the link is not a directory symlink and the target node does not yet exist + in the fs_tree (common when the target software entry is added later), queue + the link for deferred creation in `expand_pending_file_symlinks()`. + 3. Otherwise, create the primary symlink edge immediately using `_add_symlink_edge()`. + 4. 
If the symlink is a directory: + - Register it for deferred mirroring in `_pending_dir_links` + (expanded later by `expand_pending_dir_symlinks()`). + - Immediately synthesize one-hop chained symlinks for any direct, + non-symlink children of the target directory. + + Args: + link_path (str): Path of the symlink itself (e.g., "/opt/app/lib/foo.so"). + target_path (str): Absolute path of the resolved symlink target (e.g., "/usr/lib/foo.so"). + subtype (Optional[str]): Optional category for the symlink ("file" or "directory"). + + Behavior: + - Ensures link and target nodes exist as Path-type nodes in both graphs. + - Defers file symlinks whose targets are not yet known to ensure completeness. + - Adds missing edges consistently in both structures. + - Defers deeper directory mirroring to avoid recursive loops. + """ + + # ---------------------------------------------------------------------- + # Step 1: Normalize paths to consistent POSIX-style representation + # ---------------------------------------------------------------------- + link_node = normalize_path(link_path) + target_node = normalize_path(target_path) + + # ---------------------------------------------------------------------- + # Step 2: If target node is missing, defer file symlink creation + # ---------------------------------------------------------------------- + logger.debug(f"[fs_tree] subtype={subtype}") + if subtype != "directory" and not self.fs_tree.has_node(target_node): + self._pending_file_links.append((link_node, target_node, subtype)) + logger.debug(f"[fs_tree] Queued deferred file symlink: {link_node} → {target_node}") + return + + # ---------------------------------------------------------------------- + # Step 3: Create the primary symlink edge between link and target + # ---------------------------------------------------------------------- + self._add_symlink_edge(link_node, target_node, subtype=subtype) + + # ---------------------------------------------------------------------- + 
# Step 4: Handle directory symlinks — queue and synthesize one-hop children + # ---------------------------------------------------------------------- + if subtype == "directory": + # Register for deferred expansion after all directories are processed + self._pending_dir_links.append((link_node, target_node)) + logger.debug( + f"[fs_tree] Queued directory symlink for deferred expansion: " + f"{link_node} → {target_node}" + ) + + # Identify direct (non-symlink) children under the target directory + child_edges = [ + (src, dst) + for src, dst, data in self.fs_tree.edges(target_node, data=True) + if data.get("type") != "symlink" # include only structural edges + ] + + if not child_edges: + logger.debug( + f"[fs_tree] No immediate children found under {target_node}; skipping chained edges." + ) + return + + # Create one-hop chained symlink edges for each direct child + for _, child in child_edges: + child_basename = PurePosixPath(child).name + synthetic_link = normalize_path(str(PurePosixPath(link_node) / child_basename)) + + # Add synthetic child edge via the shared helper + self._add_symlink_edge(synthetic_link, child, subtype="file") + logger.debug( + f"[fs_tree] (immediate) Synthetic chained symlink created: " + f"{synthetic_link} → {child}" + ) + + def record_symlink( + self, link_path: str, target_path: str, *, subtype: Optional[str] = None + ) -> None: + """Public, stable API to record a filesystem symlink in the SBOM graphs. + + Validates inputs and delegates to the internal ``_record_symlink`` which + handles normalization, node/edge creation, and deduplication. + + Args: + link_path: Path of the symlink itself (install path). + target_path: Resolved absolute path of the symlink target. + subtype: Optional qualifier (e.g., "file" or "directory"). 
+ """ + logger.debug(f"{link_path} -> {target_path} ({subtype})") + if not isinstance(link_path, str) or not isinstance(target_path, str): + raise TypeError("link_path and target_path must be strings") + if not link_path or not target_path: + raise ValueError("link_path and target_path must be non-empty") + + # Delegate to internal implementation (already normalizes & dedupes) + self._record_symlink(link_path, target_path, subtype=subtype) + + def record_hash_node(self, file_path: str, sha256: str) -> None: + """ + Record a hash equivalence edge between a filesystem path and its content hash. + + This method links the given file node to a virtual hash node (e.g., "sha256:"), + allowing the fs_tree to represent content-equivalence relationships across files. + Such hash-based edges enable detection of identical files that were copied, + flattened, or dereferenced during extraction—restoring logical symlink equivalence + and supporting later deduplication. + + Args: + file_path (str): Absolute or relative path of the file within the extraction root. + sha256 (str): SHA-256 digest string representing the file's content. + + Behavior: + - Normalizes `file_path` to POSIX-style notation for consistent node keys. + - Ensures the hash node exists in the fs_tree with type="Hash". + - Adds a directed "hash" edge from the file node → hash node. 
+ + Example: + /usr/bin/su → sha256:f163759953aafc083e9ee25c20cda300ae01e37612eb24e54086cacffe1aca5a + """ + file_node = normalize_path(file_path) + hash_node = f"sha256:{sha256}" + + logger.debug(f"[fs_tree] Recording hash node for file: {file_node} (hash={hash_node})") + + # Ensure both nodes exist, guarding against accidental type overrides + if self.fs_tree.has_node(hash_node): + existing_type = self.fs_tree.nodes[hash_node].get("type") + if existing_type != "Hash": + logger.warning( + f"[fs_tree] Node {hash_node} already exists with type={existing_type}, " + "overwriting to type=Hash" + ) + else: + logger.debug(f"[fs_tree] Reusing existing hash node: {hash_node}") + else: + self.fs_tree.add_node(hash_node, type="Hash") + logger.debug(f"[fs_tree] Added new hash node: {hash_node}") + + self.fs_tree.nodes[hash_node]["type"] = "Hash" # enforce correct type + self.fs_tree.add_edge(file_node, hash_node, type="hash") + logger.debug(f"[fs_tree] Added hash edge: {file_node} → {hash_node} (type=hash)") + + def get_hash_equivalents(self, path_node: str) -> set[str]: + """ + Return all filesystem paths in fs_tree that share the same hash node + as the given target. Used as a fallback when symlink metadata is missing + but identical file content (hash) indicates equivalence. + + Args: + path_node (str): Normalized path node in fs_tree. + + Returns: + set[str]: Other filesystem path nodes that point to the same hash node. + """ + equivalents = set() + if not self.fs_tree.has_node(path_node): + logger.debug(f"[fs_tree] get_hash_equivalents: target node not found: {path_node}") + return equivalents + + # Find hash edges (path → sha256:...) 
+ for _, hash_node, data in self.fs_tree.out_edges(path_node, data=True): + if data.get("type") == "hash": + logger.debug(f"[fs_tree] Found hash edge: {path_node} → {hash_node}") + # For each path that shares this hash node, collect siblings + for src, _ in self.fs_tree.in_edges(hash_node): + if src != path_node: + equivalents.add(src) + logger.debug(f"[fs_tree] Added hash-equivalent sibling: {src}") + + if equivalents: + logger.debug( + f"[fs_tree] get_hash_equivalents: {path_node} has {len(equivalents)} " + f"equivalent path(s): {sorted(equivalents)}" + ) + else: + logger.debug( + f"[fs_tree] get_hash_equivalents: no hash-equivalent siblings for {path_node}" + ) + + return equivalents + def add_software_entries( self, entries: Optional[List[Software]], parent_entry: Optional[Software] = None ): - """Add software entries, merging duplicates and preserving all relationship edges. + """ + Add software entries to the SBOM graph, merging duplicates, preserving existing edges, + attaching "Contains" relationships to an optional parent, and recording ANY + file- or directory-level symlinks under each installPath. Args: - entries (Optional[List[Software]]): A list of Software entries to add to the SBOM. - parent_entry (Optional[Software]): An optional parent software entry to add "Contains" relationships to. + entries (Optional[List[Software]]): list of Software instances to add. + parent_entry (Optional[Software]): if provided, attach a "Contains" edge from this parent to each entry. 
""" if not entries: return - # if a software entry already exists with a matching file hash, augment the info in the existing entry - for e in entries: - existing = self.find_software(e.sha256) - if existing and Software.check_for_hash_collision(existing, e): - logger.warning(f"Hash collision between {existing.name} and {e.name}") - - if not existing: - # new software → add node - self.add_software(e) - entry_uuid = e.UUID - else: - # duplicate → merge and redirect edges - kept_uuid, old_uuid = existing.merge(e) - # redirect *incoming* edges to the kept node + # if a software entry already exists with a matching file hash, augment the info in the existing entry + for sw in entries: + # Merge duplicates by sha256 (or insert if new) + existing = self.find_software(sw.sha256) + if existing and Software.check_for_hash_collision(existing, sw): + logger.warning(f"Hash collision between {existing.name} and {sw.name}") + + if existing: + # Merge into existing node + # Duplicate → merge data & edges, drop the old UUID + kept_uuid, old_uuid = existing.merge(sw) + logger.debug(f"Merged {sw.UUID} into {kept_uuid}, removing {old_uuid}") + + # Redirect *incoming* edges to the kept node for src, _, key, attrs in list(self.graph.in_edges(old_uuid, keys=True, data=True)): self.graph.add_edge(src, kept_uuid, key=key, **attrs) - # redirect *outgoing* edges from the old node + # Redirect *outgoing* edges from the old node for _, dst, key, attrs in list( self.graph.out_edges(old_uuid, keys=True, data=True) ): self.graph.add_edge(kept_uuid, dst, key=key, **attrs) - # remove the old UUID entirely + # Remove the old UUID entirely if self.graph.has_node(old_uuid): self.graph.remove_node(old_uuid) - entry_uuid = kept_uuid + node_uuid = kept_uuid - # if a parent/package container was provided, attach a "Contains" edge + else: + # New software → add node + self.add_software(sw) + node_uuid = sw.UUID + logger.debug(f"Added new software node {node_uuid}") + + # 
------------------------------------------------------------------ + # If another software entry already has the same sha256, link them by hash equivalence + # ------------------------------------------------------------------ + if sw.sha256: + for other in self.software: + if other is not sw and other.sha256 == sw.sha256: + # Both files share the same content hash — link them to the same hash node + for path in sw.installPath or []: + try: + self.record_hash_node(path, sw.sha256) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning( + f"[fs_tree] Failed to record hash link for {path}: {e}" + ) + for path in other.installPath or []: + try: + self.record_hash_node(path, sw.sha256) + except Exception as e: # pylint: disable=broad-exception-caught + logger.warning( + f"[fs_tree] Failed to record hash link for {path}: {e}" + ) + logger.debug( + f"[fs_tree] Linked identical content by hash: " + f"{sw.installPath} ↔ {other.installPath}" + ) + break + + # Attach a Contains edge from parent, if any if parent_entry: parent_uuid = parent_entry.UUID - if not self.graph.has_edge(parent_uuid, entry_uuid, key="Contains"): - self.graph.add_edge(parent_uuid, entry_uuid, key="Contains") + if not self.graph.has_edge(parent_uuid, node_uuid, key="Contains"): + self.graph.add_edge(parent_uuid, node_uuid, key="Contains") + logger.debug(f"Attached Contains edge: {parent_uuid} → {node_uuid}") + + # Symlink capture under each installPath --- + for raw in sw.installPath or []: + p = pathlib.Path(raw) + + # If the installPath itself is a symlink (file or dir) + if p.is_symlink(): + real = p.resolve() + subtype = "file" if not p.is_dir() else "directory" + logger.debug(f"Found installPath symlink: {p} → {real} (subtype={subtype})") + # Call the helper to record this symlink in fs_tree + self._record_symlink(str(p), str(real), subtype=subtype) + + # If it's a directory, scan immediate children for symlinks + if p.is_dir(): + for child in p.iterdir(): + if 
child.is_symlink(): + real = child.resolve() + subtype = "file" if not child.is_dir() else "directory" + logger.debug( + f"Found child symlink: {child} → {real} (subtype={subtype})" + ) + self._record_symlink(str(child), str(real), subtype=subtype) + + def expand_pending_dir_symlinks(self) -> None: + """ + Expand all deferred directory symlinks recorded in `_pending_dir_links`. + + Each deferred pair `(link_node, target_node)` represents a directory-level + symlink such as `/usr/bin/dirE/link_to_F → /usr/bin/dirF`. + + This function performs a one-hop mirror expansion to create synthetic + symlink edges linking each immediate child of the target directory back + under the symlink source. For example: + + /usr/bin/dirE/link_to_F/runthat → /usr/bin/dirF/runthat + + The goal is to replicate the main branch’s behavior for cross-directory + mirroring (e.g., dirE ↔ dirF) without over-expanding into recursive + `link_to_F/link_to_E/...` chains. + + Behavior: + - Processes only valid directory symlink targets already present in `fs_tree`. + - Collects *depth-1* descendants (immediate children) of the target directory. + - Skips already-existing synthetic edges to avoid duplication. + - Ensures that mirrored nodes are properly typed as `Path` in both graphs. + - Mirrors all edges into both `fs_tree` and `graph` for consistency. 
+ """ + + pending_count = len(self._pending_dir_links) + logger.debug(f"[fs_tree] Expanding {pending_count} pending directory symlinks") + + # ---------------------------------------------------------------------- + # Process each deferred directory symlink pair + # ---------------------------------------------------------------------- + for link_node, target_node in list(self._pending_dir_links): + if not self.fs_tree.has_node(target_node): + logger.debug( + f"[fs_tree] Skipping {link_node} → {target_node} (target missing in fs_tree)" + ) + continue + + # Normalize and prepare for prefix-based matching + target_prefix = target_node.rstrip("/") + "/" + + # ------------------------------------------------------------------ + # Collect immediate child nodes (depth-1 only, avoid recursive nesting) + # ------------------------------------------------------------------ + immediate_children: List[str] = [] + for child in list(self.fs_tree.nodes): + if child.startswith(target_prefix) and child != target_node: + tail = child[len(target_prefix) :] + if "/" not in tail and tail: # ensure depth-1 only + immediate_children.append(child) + + logger.debug( + f"[fs_tree] Deferred mirror for {link_node} → {target_node}: " + f"{len(immediate_children)} immediate children found" + ) + + # ------------------------------------------------------------------ + # Create synthetic edges for each immediate child + # ------------------------------------------------------------------ + for child in immediate_children: + # Derive the synthetic symlink path using normalize_path + basename_posix + child_basename = basename_posix(child) + synthetic_link = normalize_path(link_node, child_basename) + + # Skip if this symlink edge already exists + if self.fs_tree.has_edge(synthetic_link, child): + logger.debug(f"[fs_tree] Skipping existing edge: {synthetic_link} → {child}") + continue + + # Add synthetic symlink edge to both fs_tree and graph + self._add_symlink_edge(synthetic_link, child, 
subtype="file") + logger.debug( + f"[fs_tree] (deferred) Synthetic chained symlink created: " + f"{synthetic_link} → {child}" + ) + + logger.debug( + f"[fs_tree] Deferred symlink expansion complete — processed {pending_count} entries." + ) + + logger.debug( + f"[fs_tree] Deferred symlink expansion complete — processed {pending_count} entries." + ) + + def expand_pending_file_symlinks(self) -> None: + """ + Expand all deferred file symlinks recorded in `_pending_file_links`. + + Ensures that file-level symlinks pointing to targets that were not yet + added to the fs_tree at record time are created once the full graph exists. + """ + pending_count = len(self._pending_file_links) + logger.debug(f"[fs_tree] Expanding {pending_count} pending file symlinks") + + for link_node, target_node, subtype in list(self._pending_file_links): + if self.fs_tree.has_edge(link_node, target_node): + continue + if not self.fs_tree.has_node(target_node): + logger.debug( + f"[fs_tree] Skipping deferred file link {link_node} → {target_node} (missing target)" + ) + continue + self._add_symlink_edge(link_node, target_node, subtype=subtype) + logger.debug(f"[fs_tree] Deferred file symlink created: {link_node} → {target_node}") + + self._pending_file_links.clear() + + def inject_symlink_metadata(self) -> None: + """ + Populate legacy-style symlink metadata into each Software entry using fs_tree + relationships, hash-equivalence, and gathered filename aliases. + + This method restores compatibility with legacy SBOM outputs by reintroducing + metadata fields that explicitly describe file aliasing relationships. + + It collects three classes of alias information: + + 1. **Filesystem Symlinks:** incoming symlink edges in `fs_tree` + (e.g., "usr/sbin/runuser" → "usr/bin/su") + + 2. **Hash-Equivalent Siblings:** other files that share identical content + (sha256) but appear at different install paths. + + 3. 
def inject_symlink_metadata(self) -> None:
    """
    Populate legacy-style symlink metadata on each Software entry.

    For every entry in `self.software` that has at least one install path,
    three kinds of alias are gathered:

    1. **Filesystem symlinks** - whatever `get_symlink_sources_for_path`
       returns for each install path.
    2. **Hash-equivalent siblings** - whatever `get_hash_equivalents`
       returns for each install path.
    3. **Gathered filename aliases** - names already present in
       `sw.fileName` that are not the basename of any install path.

    The aliases are written to `sw.metadata` under two keys, merged into an
    existing dict that already holds the key or appended as a new dict:

    - ``fileNameSymlinks``    - sorted list of alternate basenames
    - ``installPathSymlinks`` - sorted list of alternate full install paths

    Every basename alias is also appended to `sw.fileName` (in sorted order,
    skipping names already present). A `fileName` of `None` is replaced by
    an empty list first, so the append cannot fail.

    This method only calls the two lookup helpers above; it adds no nodes or
    edges itself.

    Example result for one entry:
        {
            "fileName": ["su", "runuser"],
            "metadata": [
                {"fileNameSymlinks": ["runuser"]},
                {"installPathSymlinks": ["usr/sbin/runuser"]}
            ]
        }
    """

    def _merge_md(entry, key: str, values: set) -> None:
        """Merge `values` into the first metadata dict holding `key`, else append one."""
        if not values:
            return
        merged = sorted(values)
        for md in entry.metadata:
            if isinstance(md, dict) and key in md:
                md[key] = sorted(set(md[key]) | values)
                logger.debug(f"[fs_tree] Merged {key} for {entry.UUID}: {md[key]}")
                return
        entry.metadata.append({key: merged})
        logger.debug(f"[fs_tree] Appended {key} for {entry.UUID}: {merged}")

    logger.debug("[fs_tree] Injecting legacy-style symlink metadata into Software entries")

    for sw in self.software:
        if not sw.installPath:
            continue

        file_symlinks: set = set()
        path_symlinks: set = set()

        for path in sw.installPath:
            logger.debug(f"[fs_tree] Processing installPath for symlink injection: {path}")

            # 1. Reverse lookup: symlink nodes that point at this path.
            sources = self.get_symlink_sources_for_path(path)
            if sources:
                logger.debug(f"[fs_tree] Found symlink sources for {path}: {sources}")
                for src in sources:
                    path_symlinks.add(src)
                    file_symlinks.add(PurePosixPath(src).name)

            # 2. Hash-equivalent siblings (same content, different path).
            hash_equivs = self.get_hash_equivalents(path)
            if hash_equivs:
                logger.debug(
                    f"[fs_tree] Found hash-equivalent siblings for {path}: {hash_equivs}"
                )
                for equiv in hash_equivs:
                    if equiv not in path_symlinks:
                        path_symlinks.add(equiv)
                        file_symlinks.add(PurePosixPath(equiv).name)

        # 3. Names in fileName that are not the basename of any install path.
        primary_basenames = {PurePosixPath(p).name for p in sw.installPath}
        file_name_extras = set(sw.fileName or []) - primary_basenames
        if file_name_extras:
            file_symlinks |= file_name_extras
            logger.debug(
                f"[fs_tree] Added gathered filename aliases for {sw.UUID}: {sorted(file_name_extras)}"
            )

        # Nothing discovered for this entry: leave it untouched.
        if not (file_symlinks or path_symlinks):
            continue

        # 4. Merge alias metadata into Software.metadata.
        if sw.metadata is None:
            sw.metadata = []

        _merge_md(sw, "fileNameSymlinks", file_symlinks)
        _merge_md(sw, "installPathSymlinks", path_symlinks)

        # Legacy-style alias duplication into fileName[].
        # fileName may be None (it is treated as optional above), and a
        # membership test against None would raise TypeError.
        if sw.fileName is None:
            sw.fileName = []
        # Sorted so the appended order does not depend on set iteration order.
        for alias in sorted(file_symlinks):
            if alias not in sw.fileName:
                sw.fileName.append(alias)
                logger.debug(f"[fs_tree] Added alias '{alias}' to fileName for {sw.UUID}")

    logger.debug("[fs_tree] Completed symlink metadata injection pass")
def to_dict_override(self) -> dict:
    """
    Build the JSON-ready dictionary form of this SBOM.

    The dataclass fields are dumped with `asdict()`, the internal-only
    entries (`graph`, `fs_tree`, `_loaded_relationships`,
    `_pending_dir_links`, `_pending_file_links`) are dropped, and every
    top-level `set` value is converted to a `list`.

    The `relationships` entry is then rebuilt from the edges of
    `self.graph`. An edge is left out when its key, lower-cased, equals
    ``"symlink"``, or when either endpoint node has ``type == "Path"``.

    Returns:
        dict: the serializable mapping; each relationship is a dict with
        ``xUUID``, ``yUUID`` and ``relationship`` keys.
    """
    payload = asdict(self)

    # Internal bookkeeping never belongs in the serialized output.
    for internal in (
        "graph",
        "fs_tree",
        "_loaded_relationships",
        "_pending_dir_links",
        "_pending_file_links",
    ):
        payload.pop(internal, None)

    # JSON has no set type, so sets become lists.
    set_fields = [name for name, value in payload.items() if isinstance(value, set)]
    for name in set_fields:
        payload[name] = list(payload[name])

    def _is_path_node(node_id) -> bool:
        """True when the graph node is marked as a filesystem path."""
        return self.graph.nodes.get(node_id, {}).get("type") == "Path"

    # Keep only logical edges: no symlink keys and no Path endpoints.
    payload["relationships"] = [
        {"xUUID": src, "yUUID": dst, "relationship": rel}
        for src, dst, rel in self.graph.edges(keys=True)
        if str(rel).lower() != "symlink"
        and not (_is_path_node(src) or _is_path_node(dst))
    ]

    return payload
def normalize_path(*path_parts: Union[str, pathlib.PurePosixPath]) -> str:
    r"""
    Join one or more path parts into a single POSIX-style path string.

    Parts that are not ``pathlib.PurePath`` instances are converted with
    ``str()`` and have every backslash replaced by a forward slash. Parts
    that are already ``PurePath`` objects are handed to ``PurePosixPath``
    untouched, so a literal backslash inside a ``PurePosixPath`` name is kept.

    Args:
        *path_parts: Path components, as strings or PurePath objects.

    Returns:
        str: ``PurePosixPath(*parts).as_posix()`` of the cleaned parts.

    Examples:
        >>> normalize_path("C:\\Program Files\\App")
        'C:/Program Files/App'
        >>> normalize_path(pathlib.PurePosixPath("foo\\bar"))
        'foo\\bar'
    """
    segments = []
    for part in path_parts:
        if isinstance(part, pathlib.PurePath):
            # Already a path object: trust its separators.
            segments.append(part)
        else:
            segments.append(str(part).replace("\\", "/"))

    return pathlib.PurePosixPath(*segments).as_posix()


def basename_posix(path: Union[str, pathlib.PurePath]) -> str:
    """
    Return the final component of `path`, using POSIX separators.

    The input is first passed through `normalize_path`. Any trailing slash is
    stripped unless the result is empty or exactly "/", so 'dir/' yields
    'dir'. The empty string, '.', and '/' all yield ''.
    """
    posix_text = normalize_path(path)
    if posix_text not in ("", "/"):
        posix_text = posix_text.rstrip("/")
    return pathlib.PurePosixPath(posix_text).name
@pytest.fixture
def sbom_fixture():
    """
    Fixture: returns a basic SBOM with a .NET supplier and consumer.
    - Supplier exports SomeLibrary.dll with version metadata.
    - Consumer references SomeLibrary.dll in its dotnetAssemblyRef.

    Returns:
        tuple: ``(sbom, consumer, supplier)`` -- the consumer comes first.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="uuid-supplier",
        fileName=["SomeLibrary.dll"],
        installPath=["/app/bin/SomeLibrary.dll"],
        metadata=[{"dotnetAssembly": {"Name": "SomeLibrary", "Version": "1.0.0.0"}}],
    )

    consumer = Software(
        UUID="uuid-consumer",
        installPath=["/app/bin/App.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "SomeLibrary", "Version": "1.0.0.0"}]}],
    )

    sbom.add_software(supplier)
    sbom.add_software(consumer)

    return sbom, consumer, supplier


def test_dotnet_fs_tree_match(sbom_fixture):
    """
    Test Phase 1: fs_tree resolution using get_software_by_path.
    Ensures the plugin emits a relationship if the path is indexed.
    """
    sbom, consumer, supplier = sbom_fixture

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship(consumer.UUID, supplier.UUID, "Uses")]


def test_dotnet_codebase_match():
    """
    Test: codeBase.href resolution from app.config.
    Ensures href is respected as a valid relative match.
    """
    sbom = SBOM()

    supplier = Software(UUID="uuid-lib", fileName=["lib.dll"], installPath=["/app/private/lib.dll"])

    consumer = Software(
        UUID="uuid-app",
        installPath=["/app/main.exe"],
        metadata=[
            {
                "dotnetAssemblyRef": [{"Name": "lib"}],
                "appConfigFile": {
                    "runtime": {
                        "assemblyBinding": {
                            # href is relative to the consumer's directory (/app)
                            "dependentAssembly": [{"codeBase": {"href": "private/lib.dll"}}]
                        }
                    }
                },
            }
        ],
    )

    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("uuid-app", "uuid-lib", "Uses")]


def test_dotnet_implmap_unmanaged_match():
    """
    Test: unmanaged import from dotnetImplMap should resolve as native.
    Ensures fallback probing with name variants like native.dll, native.so, etc.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="uuid-native", fileName=["native.so"], installPath=["/app/lib/native.so"]
    )

    consumer = Software(
        UUID="uuid-consumer",
        installPath=["/app/lib/main.exe"],
        # The import is named without an extension; the supplier is native.so
        metadata=[{"dotnetImplMap": [{"Name": "native"}], "dotnetAssemblyRef": []}],
    )

    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("uuid-consumer", "uuid-native", "Uses")]


def test_dotnet_same_directory():
    """
    Test: assembly in same directory as consumer should be resolved.
    Covers legacy phase and base probing behavior.
    """
    sbom = SBOM()
    supplier = Software(
        UUID="lib1", fileName=["samedirlib.dll"], installPath=["/app/samedirlib.dll"]
    )
    consumer = Software(
        UUID="app",
        installPath=["/app/main.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "samedirlib"}]}],
    )
    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("app", "lib1", "Uses")]


def test_dotnet_subdir():
    """
    Test: DLL in legacy-probed subdirectory is found by probing.
    Covers Phase 2 fallback behavior.
    """
    sbom = SBOM()
    # The supplier sits in a subdirectory named after the assembly: /app/subdirlib/
    supplier = Software(
        UUID="lib2", fileName=["subdirlib.dll"], installPath=["/app/subdirlib/subdirlib.dll"]
    )
    consumer = Software(
        UUID="app",
        installPath=["/app/main.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "subdirlib"}]}],
    )
    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("app", "lib2", "Uses")]


def test_dotnet_culture_subdir():
    """
    Test: a supplier under a culture-named subdirectory is matched when the
    consumer requests that culture.

    Setup:
    - The supplier resides under '/app/culture/' and carries no metadata at
      all, so it declares no Culture of its own.
    - The consumer requests Culture='culture' in its dotnetAssemblyRef.

    Expected: exactly one "Uses" relationship to the supplier.

    NOTE(review): whether the resolver probes the '/app/culture/' directory
    or merely lets a supplier without Culture metadata pass the culture
    filter cannot be told from this test -- confirm against the resolver.
    The rejecting case is covered by test_dotnet_culture_mismatch_filtered.
    """
    sbom = SBOM()
    supplier = Software(
        UUID="lib3", fileName=["culturelib.dll"], installPath=["/app/culture/culturelib.dll"]
    )
    consumer = Software(
        UUID="app",
        installPath=["/app/main.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "culturelib", "Culture": "culture"}]}],
    )
    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("app", "lib3", "Uses")]


def test_dotnet_no_match_without_exact_basename():
    """
    Test: a DLL in the same directory is NOT matched when its basename does
    not align with the referenced assembly name.

    Scenario:
      - Consumer imports 'heur' (filename variants: 'heur', 'heur.dll').
      - Provider lives in the same directory but is installed as
        '/app/bin/heur.dll.bak'.

    Expected behavior:
      - Phase 1 (fs_tree): no exact path '/app/bin/heur' or '/app/bin/heur.dll'.
      - Phase 2 (installPath + fileName): installPath does not end with
        'heur' or 'heur.dll', so no match is accepted.
      - No heuristic "same-directory" fallback is applied, so no relationship
        is created.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="lib-heur",
        # fileName matches the import, but the installed basename does not
        fileName=["heur.dll"],
        installPath=["/app/bin/heur.dll.bak"],
    )

    consumer = Software(
        UUID="app-heur",
        installPath=["/app/bin/app.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "heur"}]}],
    )

    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])

    assert results == []


def test_dotnet_private_path():
    """
    Test: DLL resolved from app.config probing.privatePath directories.
    Ensures private paths are appended to probe set.
    """
    sbom = SBOM()
    supplier = Software(
        UUID="lib4", fileName=["pvtlib.dll"], installPath=["/app/bin/custom/pvtlib.dll"]
    )
    consumer = Software(
        UUID="app",
        installPath=["/app/bin/app.exe"],
        metadata=[
            {
                "dotnetAssemblyRef": [{"Name": "pvtlib"}],
                "appConfigFile": {
                    # "custom" is relative to the consumer's directory (/app/bin)
                    "runtime": {"assemblyBinding": {"probing": {"privatePath": "custom"}}}
                },
            }
        ],
    )
    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == [Relationship("app", "lib4", "Uses")]


# NOTE(review): the test below is commented out; the reason is not recorded
# here. Re-enable it or delete it once version filtering behavior is settled.
# def test_dotnet_version_mismatch_filtered():
#     """
#     Test: supplier has wrong version; should be filtered out by version check.
#     """
#     sbom = SBOM()
#     supplier = Software(
#         UUID="lib5",
#         fileName=["wrong.dll"],
#         installPath=["/lib/wrong.dll"],
#         metadata=[{"dotnetAssembly": {"Name": "wrong", "Version": "2.0.0.0"}}],
#     )
#     consumer = Software(
#         UUID="app",
#         installPath=["/lib/app.exe"],
#         metadata=[{"dotnetAssemblyRef": [{"Name": "wrong", "Version": "1.0.0.0"}]}],
#     )
#     sbom.add_software(supplier)
#     sbom.add_software(consumer)

#     results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
#     assert results == []


def test_dotnet_culture_mismatch_filtered():
    """
    Test: supplier has wrong culture; should be filtered out by culture check.

    The supplier declares Culture='xx' while the consumer requests 'yy'.
    """
    sbom = SBOM()
    supplier = Software(
        UUID="lib6",
        fileName=["wrongcult.dll"],
        installPath=["/lib/wrongcult.dll"],
        metadata=[{"dotnetAssembly": {"Name": "wrongcult", "Culture": "xx"}}],
    )
    consumer = Software(
        UUID="app",
        installPath=["/lib/app.exe"],
        metadata=[{"dotnetAssemblyRef": [{"Name": "wrongcult", "Culture": "yy"}]}],
    )
    sbom.add_software(supplier)
    sbom.add_software(consumer)

    results = dotnet_relationship.establish_relationships(sbom, consumer, consumer.metadata[0])
    assert results == []
@pytest.fixture
def example_sbom():
    """
    Fixture: an SBOM holding suppliers and consumers for six ELF scenarios.

    Returns:
        tuple: ``(sbom, case_map)`` where ``case_map`` maps a scenario label
        to ``(consumer Software, expected supplier UUID)``.
    """
    sbom = SBOM()

    sw1 = Software(UUID="uuid-1", fileName=["libfoo.so.1"], installPath=["/usr/lib/libfoo.so.1"])
    sw2 = Software(UUID="uuid-2", fileName=["libbar.so"], installPath=["/opt/myapp/lib/libbar.so"])

    # sw3a and sw3b share one installPath but declare different dependencies
    sw3a = Software(
        UUID="uuid-3a",
        installPath=["/opt/myapp/bin/myapp"],
        metadata=[{"elfDependencies": ["/usr/lib/libfoo.so.1"]}],
    )
    sw3b = Software(
        UUID="uuid-3b",
        installPath=["/opt/myapp/bin/myapp"],
        metadata=[{"elfDependencies": ["libbar.so"], "elfRunpath": ["$ORIGIN/../lib"]}],
    )
    sw4_consumer = Software(
        UUID="uuid-4-consumer",
        installPath=["/bin/testbin"],
        metadata=[{"elfDependencies": ["libxyz.so"]}],
    )
    # sw4 lists itself as a dependency; it is the expected supplier for "system"
    sw4 = Software(
        UUID="uuid-4",
        fileName=["libxyz.so"],
        installPath=["/lib/libxyz.so"],
        metadata=[{"elfDependencies": ["libxyz.so"]}],
    )
    sw5 = Software(UUID="uuid-5", fileName=["libdep.so"], installPath=["/app/lib/libdep.so"])
    sw6 = Software(
        UUID="uuid-6",
        installPath=["/app/bin/mybin"],
        metadata=[{"elfDependencies": ["libdep.so"], "elfRunpath": ["$ORIGIN/../lib"]}],
    )
    sw7 = Software(
        UUID="uuid-7",
        installPath=["/legacy/bin/legacyapp"],
        metadata=[{"elfDependencies": ["libbar.so"], "elfRpath": ["/opt/myapp/lib"]}],
    )
    # sw8 is installed as libreal.so but also known by the name libalias.so
    sw8 = Software(UUID="uuid-8", fileName=["libalias.so"], installPath=["/opt/alt/lib/libreal.so"])
    sw9 = Software(
        UUID="uuid-9",
        installPath=["/opt/alt/bin/app"],
        metadata=[{"elfDependencies": ["libalias.so"], "elfRunpath": ["/opt/alt/lib"]}],
    )

    # First add all software so fs_tree has the real file nodes
    for sw in [sw1, sw2, sw3a, sw3b, sw4, sw4_consumer, sw5, sw6, sw7, sw8, sw9]:
        sbom.add_software(sw)

    # Now add the symlink mapping for sw8 (alias -> real file)
    sbom.record_symlink("/opt/alt/lib/libalias.so", "/opt/alt/lib/libreal.so", subtype="file")

    # And expand pending file symlinks so fs_tree exposes the alias path
    sbom.expand_pending_file_symlinks()

    return sbom, {
        "absolute": (sw3a, "uuid-1"),
        "relative": (sw3b, "uuid-2"),
        "system": (sw4_consumer, "uuid-4"),
        "origin": (sw6, "uuid-5"),
        "rpath": (sw7, "uuid-2"),
        "symlink": (sw9, "uuid-8"),
    }


@pytest.mark.parametrize("label", ["absolute", "relative", "system", "origin", "rpath", "symlink"])
def test_elf_relationship_cases(example_sbom, label):
    """
    Validate ELF relationship resolution across multiple scenarios.

    This test is parameterized to exercise the six primary resolution paths used by
    the ELF plugin. For each `label`, the `example_sbom` fixture returns:
      - `sw`: the consumer `Software` object under test
      - `expected_uuid`: the UUID of the supplier `Software` that should be linked via a
        `Relationship(sw.UUID, expected_uuid, "Uses")`

    The cases covered:
      - "absolute": dependency is an absolute path (e.g., /usr/lib/libfoo.so.1)
      - "relative": dependency name + runpath derived from $ORIGIN, e.g. "$ORIGIN/../lib"
      - "system":   dependency resolved via standard system library directories (e.g., /lib)
      - "origin":   dependency resolved via $ORIGIN expansion relative to the binary
      - "rpath":    dependency resolved via legacy RPATH entries
      - "symlink":  dependency resolved through a symlink edge in the SBOM fs_tree

    Expectations:
      - Exactly one "Uses" relationship is emitted.
      - The dependency resolves to `expected_uuid`, and never to the consumer itself.
    """
    # Debug prints are helpful during bring-up, but can be noisy in CI.
    # Keep them for now; if logs are cluttered, consider replacing with logger.debug or removing.
    print(f"==== RUNNING: {label} ====")
    sbom, case_map = example_sbom

    # Retrieve the consumer under test and the expected supplier UUID
    sw, expected_uuid = case_map[label]

    # Pull the ELF metadata for this software (may include elfDependencies, elfRunpath/Rpath, etc.)
    metadata = sw.metadata[0] if sw.metadata else {}
    print("Dependency paths:", metadata.get("elfDependencies", []))
    print("fs_tree nodes:", list(sbom.fs_tree.nodes))

    # Optional trace: show how raw dependency strings normalize to POSIX and what fs_tree returns
    for dep in metadata.get("elfDependencies", []):
        norm = pathlib.PurePosixPath(dep).as_posix()
        print(f"Trying lookup: {norm} ->", sbom.get_software_by_path(norm))

    # Execute the plugin and assert a single, correct relationship is produced
    result = elf_relationship.establish_relationships(sbom, sw, metadata)

    # Sanity checks: one result, and it matches the expected supplier UUID
    assert result is not None, f"{label} case failed: no result"
    assert len(result) == 1, f"{label} case failed: expected 1 relationship"
    assert result[0] == Relationship(sw.UUID, expected_uuid, "Uses"), (
        f"{label} case mismatch: {result[0]} != {expected_uuid}"
    )


def test_no_match_edge_case():
    """
    Test case: No matching dependency by any means (fs_tree, legacy, or heuristic).
    Expect no relationships.
    """
    binary = Software(
        UUID="bin-uuid",
        fileName=["mybin"],
        installPath=["/some/bin/mybin"],
        metadata=[{"elfDependencies": ["libnotfound.so"], "elfRunpath": ["/some/lib"]}],
    )

    unrelated = Software(
        UUID="unrelated-uuid",
        fileName=["libsomethingelse.so"],
        installPath=["/unrelated/path/libsomethingelse.so"],
    )

    # Software is passed to the SBOM constructor here rather than via add_software()
    sbom = SBOM(systems=[], hardware=[], software=[binary, unrelated])

    metadata = binary.metadata[0]
    results = establish_relationships(sbom, binary, metadata)

    assert results is not None
    assert len(results) == 0, "Expected no relationships for unmatched dependency"


def test_symlink_heuristic_guard():
    """
    Tests that the symlink heuristic does not falsely match entries where
    fileName matches but installPath is in a different directory.
    """
    binary = Software(
        UUID="bin-uuid",
        fileName=["myapp"],
        installPath=["/opt/app/bin/myapp"],
        metadata=[{"elfDependencies": ["libalias.so"], "elfRunpath": ["/opt/app/lib"]}],
    )

    # Same file name, but located in a different directory -> should NOT match
    candidate = Software(
        UUID="falsematch-uuid", fileName=["libalias.so"], installPath=["/different/dir/libalias.so"]
    )

    sbom = SBOM(systems=[], hardware=[], software=[binary, candidate])

    metadata = binary.metadata[0]
    results = establish_relationships(sbom, binary, metadata)

    # Other relationships are tolerated; only the false match is forbidden
    assert results is not None
    assert all(rel.yUUID != "falsematch-uuid" for rel in results), (
        "Heuristic should not have matched"
    )
@pytest.fixture
def java_class_path():
    """Relative class-file path for com.example.HelloWorld."""
    # NOTE(review): no test visible in this module requests this fixture -- confirm it is still needed.
    return "com/example/HelloWorld.class"


@pytest.fixture
def test_sbom():
    """
    Fixture: SBOM with one class exporter and one jar importing that class.

    Returns:
        tuple: ``(sbom, jar_importer, jar_supplier)``.
    """
    # NOTE(review): no test visible in this module requests this fixture, and
    # its ``test_`` prefix is unusual for a fixture -- confirm and consider renaming.
    sbom = SBOM()

    # Software exporting a class
    jar_supplier = Software(
        UUID="uuid-supplier",
        fileName=["HelloWorld.class"],
        installPath=["/app/lib/com/example/HelloWorld.class"],
        metadata=[
            {
                "javaClasses": {
                    "com.example.HelloWorld": {
                        "javaExports": ["com.example.HelloWorld"],
                        "javaImports": [],
                    }
                }
            }
        ],
    )

    # Software importing that class (dependency)
    jar_importer = Software(
        UUID="uuid-importer",
        fileName=["app.jar"],
        installPath=["/app/bin/app.jar"],
        metadata=[
            {
                "javaClasses": {
                    "com.example.Main": {
                        "javaExports": ["com.example.Main"],
                        "javaImports": ["com.example.HelloWorld"],
                    }
                }
            }
        ],
    )

    sbom.add_software(jar_supplier)
    sbom.add_software(jar_importer)

    return sbom, jar_importer, jar_supplier


def test_phase_1_fs_tree_match():
    """
    Phase 1: sbom.get_software_by_path() should resolve when the importer's
    base dir + class path points at a real node in fs_tree.

    Here the importer lives in /app/lib, and the supplier is installed at
    /app/lib/com/example/HelloWorld.class.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="uuid-supplier",
        fileName=["HelloWorld.class"],
        installPath=["/app/lib/com/example/HelloWorld.class"],
        metadata=[
            {"javaClasses": {"com.example.HelloWorld": {"javaExports": ["com.example.HelloWorld"]}}}
        ],
    )

    importer = Software(
        UUID="uuid-importer",
        installPath=["/app/lib/app.jar"],  # NOTE: importer now under /app/lib
        metadata=[
            {"javaClasses": {"com.example.Main": {"javaImports": ["com.example.HelloWorld"]}}}
        ],
    )

    sbom.add_software(supplier)
    sbom.add_software(importer)

    results = java_relationship.establish_relationships(sbom, importer, importer.metadata[0])
    assert results == [Relationship(importer.UUID, supplier.UUID, "Uses")]


def test_phase_2_legacy_path_match():
    """
    Phase 2: Match based on installPath + fileName fallback

    The importer (/other/bin) and the supplier (/app/classes/...) share no
    base directory, so a base-dir + class-path lookup cannot hit the supplier.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="uuid-supplier",
        fileName=["HelloWorld.class"],
        installPath=["/app/classes/com/example/HelloWorld.class"],
        metadata=[
            {"javaClasses": {"com.example.HelloWorld": {"javaExports": ["com.example.HelloWorld"]}}}
        ],
    )

    importer = Software(
        UUID="uuid-importer",
        installPath=["/other/bin/app.jar"],
        metadata=[
            {"javaClasses": {"com.example.Main": {"javaImports": ["com.example.HelloWorld"]}}}
        ],
    )

    sbom.add_software(supplier)
    sbom.add_software(importer)

    results = java_relationship.establish_relationships(sbom, importer, importer.metadata[0])

    assert results is not None
    assert results == [Relationship(importer.UUID, supplier.UUID, "Uses")]


def test_no_match_returns_empty():
    """
    Validate that no relationship is returned when no match is possible

    The only other entry is Other.class, which has no javaClasses metadata.
    """
    sbom = SBOM()

    supplier = Software(
        UUID="uuid-supplier", fileName=["Other.class"], installPath=["/somewhere/Other.class"]
    )

    importer = Software(
        UUID="uuid-importer",
        installPath=["/bin/app.jar"],
        metadata=[
            {"javaClasses": {"com.example.Main": {"javaImports": ["com.example.HelloWorld"]}}}
        ],
    )

    sbom.add_software(supplier)
    sbom.add_software(importer)

    results = java_relationship.establish_relationships(sbom, importer, importer.metadata[0])

    assert results == []
@pytest.fixture
def basic_pe_sbom():
    """
    Create a minimal SBOM with:
    - One binary (UUID: uuid-bin) located in C:/bin
    - One DLL (UUID: uuid-dll) located in C:/bin
    - The binary declares a direct PE import of 'foo.dll'

    Returns:
        Tuple[SBOM, Software, Software]: the SBOM, the binary, and the DLL
    """
    sbom = SBOM()

    dll = Software(
        UUID="uuid-dll",
        fileName=["foo.dll"],
        installPath=["C:/bin/foo.dll"],
    )

    binary = Software(
        UUID="uuid-bin",
        installPath=["C:/bin/app.exe"],
        metadata=[{"peImport": ["foo.dll"]}],
    )

    # Add both software components to the SBOM
    sbom.add_software(dll)
    sbom.add_software(binary)

    return sbom, binary, dll


def test_pe_import_via_fs_tree(basic_pe_sbom):
    """
    Test that a PE import is resolved correctly via fs_tree-based path matching.
    """
    sbom, binary, dll = basic_pe_sbom

    results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0])

    assert results is not None
    assert len(results) == 1
    assert results[0] == Relationship(binary.UUID, dll.UUID, "Uses")


def test_pe_import_legacy_fallback():
    """
    Test that PE relationship fallback works when fs_tree does not contain the path.
    It should fall back to installPath + fileName matching.

    NOTE(review): the DLL is registered with add_software() exactly as in the
    fs_tree test above, so this case may be resolved by the fs_tree phase
    rather than the fallback -- confirm which phase actually fires. What this
    test does add is coverage of the ``peBoundImport`` metadata key.
    """
    sbom = SBOM()

    dll = Software(
        UUID="uuid-dll",
        fileName=["bar.dll"],
        installPath=["D:/tools/bar.dll"],
    )

    binary = Software(
        UUID="uuid-bin",
        installPath=["D:/tools/app.exe"],
        metadata=[{"peBoundImport": ["bar.dll"]}],
    )

    sbom.add_software(dll)
    sbom.add_software(binary)

    results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0])

    assert results is not None
    assert results == [Relationship("uuid-bin", "uuid-dll", "Uses")]


def test_pe_same_directory_match():
    """
    Verify that a DLL with a matching fileName in the importer's directory is resolved.

    Note:
    - This will typically resolve in Phase 1 (fs_tree exact path). If fs_tree were
      unavailable for the exact path, the resolver's fallback also matches by
      fileName + shared directory. (In the current resolver, Phase 2 and Phase 3
      both use that criterion.)
    - The import is declared through the ``peDelayImport`` metadata key.
    """
    sbom = SBOM()

    dll = Software(
        UUID="uuid-dll",
        fileName=["common.dll"],
        installPath=["E:/bin/common.dll"],  # same directory as the importer (E:/bin)
    )

    binary = Software(
        UUID="uuid-bin",
        fileName=["app"],
        installPath=["E:/bin/app.exe"],
        metadata=[{"peDelayImport": ["common.dll"]}],
    )

    sbom.add_software(dll)
    sbom.add_software(binary)

    results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0])

    # extra sanity check on normalized parent dir
    assert pathlib.PurePosixPath("E:/bin/common.dll").parent.as_posix() == "E:/bin"
    assert results is not None
    assert results == [Relationship("uuid-bin", "uuid-dll", "Uses")]


def test_pe_no_match():
    """
    Ensure no relationship is emitted if the imported DLL cannot be resolved
    through any mechanism (fs_tree, legacy, or heuristic).
    """
    sbom = SBOM()

    # Neither the fileName nor the installPath basename equals the imported name
    dll = Software(
        UUID="uuid-dll",
        fileName=["missing.dll"],
        installPath=["Z:/opt/ghost.dll"],
    )

    binary = Software(
        UUID="uuid-bin",
        installPath=["Z:/opt/app.exe"],
        metadata=[{"peImport": ["doesnotexist.dll"]}],
    )

    sbom.add_software(dll)
    sbom.add_software(binary)

    results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0])

    assert results == []


def test_pe_has_required_fields():
    """
    Unit test for has_required_fields(): ensure it returns True only if at least
    one valid PE field is present in the metadata.
    """
    assert pe_relationship.has_required_fields({"peImport": ["foo.dll"]})
    assert pe_relationship.has_required_fields({"peBoundImport": ["bar.dll"]})
    assert pe_relationship.has_required_fields({"peDelayImport": ["baz.dll"]})
    assert not pe_relationship.has_required_fields({"unrelated": []})


def test_pe_no_false_positive_mismatched_basename():
    """
    Ensure the resolver does not incorrectly match a DLL name to an installPath
    whose filename does not equal the imported DLL name, even if the directory
    matches and fileName[] contains the imported name.
    """
    sbom = SBOM()

    # Software entry claims multiple DLL names
    dll = Software(
        UUID="uuid-dll",
        fileName=["afile.dll", "bfile.dll"],
        installPath=[
            "C:/somedir/afile.dll",  # in probedir, but wrong basename
            "C:/anotherdir/bfile.dll",  # correct basename, wrong directory
        ],
    )

    binary = Software(
        UUID="uuid-bin",
        installPath=["C:/somedir/app.exe"],
        metadata=[{"peImport": ["bfile.dll"]}],
    )

    sbom.add_software(dll)
    sbom.add_software(binary)

    results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0])

    # No relationship should be created because no installPath satisfies:
    #   dir == probedir AND basename == imported name
    assert results == []
+ """ + sbom = SBOM() + + dll = Software( + UUID="uuid-dll", + fileName=["Foo.DLL"], # DLL declared with uppercase letters + installPath=["C:/bin/Foo.DLL"], # actual installed path (Windows-style) + ) + + binary = Software( + UUID="uuid-bin", + installPath=["C:/bin/app.exe"], + metadata=[{"peImport": ["foo.dll"]}], # import uses lowercase + ) + + # Add components to the SBOM + sbom.add_software(dll) + sbom.add_software(binary) + + # Resolve PE imports + results = pe_relationship.establish_relationships(sbom, binary, binary.metadata[0]) + + # The resolver should treat basenames case-insensitively and produce a match + assert results == [Relationship("uuid-bin", "uuid-dll", "Uses")] diff --git a/tests/sbomtypes/test_fs_tree.py b/tests/sbomtypes/test_fs_tree.py new file mode 100644 index 000000000..0d721d0c8 --- /dev/null +++ b/tests/sbomtypes/test_fs_tree.py @@ -0,0 +1,188 @@ +# pylint: disable=redefined-outer-name +import pytest + +from surfactant.sbomtypes import SBOM, Software + + +@pytest.fixture +def software_entries(): + return [ + Software(UUID="uuid-1", installPath=["/usr/bin/ls"]), + Software(UUID="uuid-2", installPath=["/usr/lib/libc.so"]), + Software(UUID="uuid-3", installPath=["/opt/tools/bin/run"]), + ] + + +def test_fs_tree_population(software_entries): + """ + Verify that SBOM._add_software_to_fs_tree() builds the filesystem tree correctly. + + Expectations: + - Each installPath is normalized and inserted into fs_tree. + - All intermediate directory nodes are created (e.g., /usr, /usr/bin). + - Edges reflect parent→child directory hierarchy. + - Leaf nodes (final installPath) are tagged with the correct software_uuid. 
+ """ + sbom = SBOM() + for sw in software_entries: + sbom.add_software(sw) # triggers _add_software_to_fs_tree() + fs = sbom.fs_tree + + # Check that expected nodes exist + assert fs.has_node("/usr") + assert fs.has_node("/usr/bin") + assert fs.has_node("/usr/bin/ls") + assert fs.has_node("/usr/lib/libc.so") + assert fs.has_node("/opt/tools/bin/run") + + # Check that edges reflect directory hierarchy + assert fs.has_edge("/usr", "/usr/bin") + assert fs.has_edge("/usr/bin", "/usr/bin/ls") + assert fs.has_edge("/usr/lib", "/usr/lib/libc.so") + assert fs.has_edge("/opt", "/opt/tools") + assert fs.has_edge("/opt/tools/bin", "/opt/tools/bin/run") + + # Check software UUID tagging + assert fs.nodes["/usr/bin/ls"]["software_uuid"] == "uuid-1" + assert fs.nodes["/usr/lib/libc.so"]["software_uuid"] == "uuid-2" + assert fs.nodes["/opt/tools/bin/run"]["software_uuid"] == "uuid-3" + + +def test_get_software_by_path(software_entries): + """ + Verify SBOM.get_software_by_path() returns the correct Software object. + + Expectations: + - For a valid installPath, it returns the matching Software (by UUID). + - For a missing path, it returns None. + - Normalization ensures consistent lookups (tested separately). + """ + sbom = SBOM() + for sw in software_entries: + sbom.add_software(sw) + + sw1 = sbom.get_software_by_path("/usr/bin/ls") + sw2 = sbom.get_software_by_path("/opt/tools/bin/run") + sw_invalid = sbom.get_software_by_path("/nonexistent") + + assert sw1.UUID == "uuid-1" + assert sw2.UUID == "uuid-3" + assert sw_invalid is None + + +def test_fs_tree_windows_normalization(): + r""" + Ensure Windows-style paths are normalized into POSIX form in fs_tree. + + Behavior: + - SBOM.add_software() converts r"C:\Tools\bin\run.exe" to "C:/Tools/bin/run.exe". + - get_software_by_path() should resolve with either Windows-style backslashes + or normalized POSIX-style slashes. 
+ """ + sbom = SBOM() + win_sw = Software(UUID="uuid-win", installPath=[r"C:\Tools\bin\run.exe"]) + sbom.add_software(win_sw) + + # Node should exist using POSIX-normalized form + assert sbom.fs_tree.has_node("C:/Tools/bin/run.exe") + # Lookup should work with either style + assert sbom.get_software_by_path(r"C:\Tools\bin\run.exe").UUID == "uuid-win" + assert sbom.get_software_by_path("C:/Tools/bin/run.exe").UUID == "uuid-win" + + +def test_get_software_by_path_symlink_traversal(): + """ + Verify get_software_by_path() can resolve symlinks in fs_tree. + + Cases: + - Single-hop alias: /lib/libc.so.6 → /usr/lib/libc.so.6 + - Multi-hop alias: /lib/libc.so → /lib/libc.so.6 → /usr/lib/libc.so.6 + + Expected: + - Both /lib/libc.so.6 and /lib/libc.so resolve to the target Software at + /usr/lib/libc.so.6. + """ + sbom = SBOM() + target = Software(UUID="uuid-libc", installPath=["/usr/lib/libc.so.6"]) + sbom.add_software(target) + + # Single-hop alias + sbom.record_symlink("/lib/libc.so.6", "/usr/lib/libc.so.6", subtype="file") + assert sbom.get_software_by_path("/lib/libc.so.6").UUID == "uuid-libc" + + # Multi-hop alias + sbom.record_symlink("/lib/libc.so", "/lib/libc.so.6", subtype="file") + assert sbom.get_software_by_path("/lib/libc.so").UUID == "uuid-libc" + + +def test_get_software_by_path_symlink_cycle_guard(): + """ + Ensure symlink cycles do not cause infinite loops. + + Case: + - /a → /b + - /b → /a + - Neither node has a software_uuid tag. + + Expected: + - get_software_by_path("/a") returns None gracefully without recursion errors. + """ + sbom = SBOM() + sbom.record_symlink("/a", "/b", subtype="file") + sbom.record_symlink("/b", "/a", subtype="file") + assert sbom.get_software_by_path("/a") is None + + +def test_to_dict_override_filters_path_edges(): + """ + Confirm that to_dict_override() excludes filesystem-only edges. + + Steps: + - Add a Software with installPath=/usr/bin/ls. + - Record a symlink /bin/ls → /usr/bin/ls (creates Path nodes and symlink edge). 
+ - Call to_dict_override(). + + Expected: + - 'graph' and 'fs_tree' internals are absent from the output dict. + - No relationship entries involve '/bin/ls' or '/usr/bin/ls' path nodes. + """ + sbom = SBOM() + sw = Software(UUID="uuid-1", installPath=["/usr/bin/ls"]) + sbom.add_software(sw) + + sbom.record_symlink("/bin/ls", "/usr/bin/ls", subtype="file") + data = sbom.to_dict_override() + + assert "graph" not in data and "fs_tree" not in data + for rel in data.get("relationships", []): + assert rel["xUUID"] != "/bin/ls" and rel["yUUID"] != "/usr/bin/ls" + + +def test_expand_pending_dir_symlinks_creates_chained_edges(tmp_path): + """ + Verify that expand_pending_dir_symlinks() synthesizes one-hop chained edges + for deferred directory symlinks. + + Scenario: + dirE/link_to_F → dirF + dirF/runthat → /bin/echo + + Expected: + After expansion, fs_tree contains: + /dirE/link_to_F/runthat → /dirF/runthat + """ + sbom = SBOM() + + # Create base structure + sbom.fs_tree.add_edge("/usr/bin", "/usr/bin/dirE") + sbom.fs_tree.add_edge("/usr/bin", "/usr/bin/dirF") + sbom.fs_tree.add_edge("/usr/bin/dirF", "/usr/bin/dirF/runthat") + + # Record directory symlink (deferred expansion) + sbom.record_symlink("/usr/bin/dirE/link_to_F", "/usr/bin/dirF", subtype="directory") + + # Expand queued directory symlinks + sbom.expand_pending_dir_symlinks() + + # Verify synthetic chained edge was created + assert sbom.fs_tree.has_edge("/usr/bin/dirE/link_to_F/runthat", "/usr/bin/dirF/runthat") diff --git a/tests/symlink/test_resolve_links.py b/tests/symlink/test_resolve_links.py deleted file mode 100644 index 3136b7696..000000000 --- a/tests/symlink/test_resolve_links.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2023 Lawrence Livermore National Security, LLC -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT -import os -import pathlib -import tempfile - -import pytest - -from surfactant.cmd.generate import resolve_link - -base_dir = pathlib.Path(__file__).parent.absolute() - - -def symlink(src, dst, target_is_directory): - try: - os.symlink(src, dst, target_is_directory) - except FileExistsError: - pass - - -def create_symlinks(temp_dir): - # Make sure this is always the working directory - os.chdir(base_dir) - os.makedirs(os.path.join(temp_dir, "test_dir", "subdir"), exist_ok=True) - os.chdir(os.path.join(temp_dir, "test_dir")) - symlink("..", "parent", True) - symlink("parent", "link_to_parent", True) - symlink("/none/", "does_not_exist", True) - symlink("does_not_exist", "link_to_non_existant", False) - symlink("..", "subdir/parent", True) - # Revert back to the original working directory - os.chdir(base_dir) - - -@pytest.mark.skipif(os.name != "posix", reason="requires posix os") -def test_symlinks(): - with tempfile.TemporaryDirectory() as temp_dir: - create_symlinks(temp_dir) - base_path = os.path.realpath(os.path.join(temp_dir, "test_dir")) - assert resolve_link(os.path.join(base_path, "parent"), base_path, base_path) == base_path - assert ( - resolve_link(os.path.join(base_path, "link_to_parent"), base_path, base_path) - == base_path - ) - assert resolve_link(os.path.join(base_path, "does_not_exist"), base_path, base_path) is None - assert ( - resolve_link( - os.path.join(base_path, "subdir", "parent"), - os.path.join(base_path, "subdir"), - base_path, - ) - == base_path - ) - assert ( - resolve_link(os.path.join(base_path, "link_to_non_existant"), base_path, base_path) - is None - ) - - -if __name__ == "__main__": - test_symlinks() diff --git a/tests/utils/test_paths.py b/tests/utils/test_paths.py new file mode 100644 index 000000000..31f06cfe4 --- /dev/null +++ b/tests/utils/test_paths.py @@ -0,0 +1,68 @@ +from pathlib import PurePosixPath + +from surfactant.utils.paths import normalize_path + + +def 
test_single_string_path(): + """Normalize a single Windows-style path string to POSIX format.""" + assert normalize_path("C:\\Program Files\\App") == "C:/Program Files/App" + + +def test_multiple_parts(): + """Join multiple path parts into a normalized POSIX-style path.""" + assert normalize_path("C:", "Program Files", "App") == "C:/Program Files/App" + + +def test_with_purepath(): + """Combine a PurePosixPath with a string part and normalize the result.""" + assert normalize_path(PurePosixPath("C:/Program Files"), "App") == "C:/Program Files/App" + + +def test_trailing_slash_is_preserved(): + """Strip trailing slashes from non-root POSIX paths.""" + assert normalize_path("C:/App/") == "C:/App" # PosixPath strips trailing slashes + + +def test_empty_parts(): + """Normalize an empty string to the current directory ('.').""" + assert normalize_path("") == "." + + +def test_pureposixpath_with_literal_backslash(): + """If given a PurePosixPath, a literal backslash should be preserved.""" + path = PurePosixPath("foo\\bar") # backslash is part of the filename + result = normalize_path(path) + assert result == "foo\\bar" # ensure it's not replaced with a forward slash + + +def test_pureposixpath_mixed_with_string(): + """When mixing a PurePosixPath and a string, only the string parts are cleaned.""" + path = PurePosixPath("foo\\bar") + result = normalize_path(path, "baz\\qux") + # The PurePosixPath part should keep its backslash, the string part should be normalized + assert result == "foo\\bar/baz/qux" + + +def test_no_arguments_returns_dot(): + """normalize_path() with no arguments should return '.'""" + assert normalize_path() == "." 
+ + +def test_absolute_path_overrides_previous_parts(): + """Absolute path parts should override earlier parts, matching pathlib semantics.""" + assert normalize_path("/root", "/etc/passwd") == "/etc/passwd" + + +def test_redundant_slashes_are_collapsed(): + """Multiple slashes between parts should collapse into a single slash.""" + assert normalize_path("foo//bar", "baz") == "foo/bar/baz" + + +def test_dot_and_dotdot_not_resolved(): + """Relative navigation components should be preserved (no resolution).""" + assert normalize_path("foo/../bar") == "foo/../bar" + + +def test_trailing_slash_in_middle_part_is_ignored(): + """Trailing slashes in intermediate parts should not affect joining.""" + assert normalize_path("foo/", "bar/") == "foo/bar"