diff --git a/src/dda/build/__init__.py b/src/dda/build/__init__.py
new file mode 100644
index 00000000..79ca6026
--- /dev/null
+++ b/src/dda/build/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
diff --git a/src/dda/build/metadata/__init__.py b/src/dda/build/metadata/__init__.py
new file mode 100644
index 00000000..79ca6026
--- /dev/null
+++ b/src/dda/build/metadata/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
diff --git a/src/dda/build/metadata/digests.py b/src/dda/build/metadata/digests.py
new file mode 100644
index 00000000..16c79e48
--- /dev/null
+++ b/src/dda/build/metadata/digests.py
@@ -0,0 +1,85 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
+
+from __future__ import annotations
+
+from enum import StrEnum, auto
+from typing import TYPE_CHECKING, override
+
+from msgspec import Struct
+
+from dda.utils.fs import Path
+
+if TYPE_CHECKING:
+    from dda.cli.application import Application
+
+
+class DigestType(StrEnum):
+    """
+    The type of digest for a build artifact, i.e. the possible values for the `type` field of an ArtifactDigest
+    (carried in the `digest` field of the BuildMetadata struct).
+    """
+
+    # Digest applicable to files
+    # Support only SHA256 for now
+    FILE_SHA256 = auto()
+
+    # Digest applicable to OCI container images
+    # Use the OCI image digest format (result of `docker image inspect --format '{{.Id}}'`)
+    OCI_DIGEST = auto()
+
+    # Catch-all for non-standard digest types
+    OTHER = auto()
+
+    @classmethod
+    @override
+    def _missing_(cls, value: object) -> DigestType:
+        # TODO: Add a warning here probably
+        return cls.OTHER
+
+    def calculate_digest(self, app: Application, artifact_spec: Path | str) -> ArtifactDigest:
+        match self:
+            case DigestType.FILE_SHA256:
+                digest_value = Path(artifact_spec).hexdigest(algorithm="sha256")
+            case DigestType.OCI_DIGEST:
+                digest_value = app.tools.docker.get_image_digest(str(artifact_spec))
+            case _:
+                msg = f"Cannot calculate digest for digest type: {self}"
+                raise NotImplementedError(msg)
+
+        return ArtifactDigest(value=digest_value, type=self)
+
+
+class ArtifactDigest(Struct, frozen=True):
+    """
+    A digest for a build artifact.
+    """
+
+    value: str
+    type: DigestType
+
+    def __post_init__(self) -> None:
+        # Validate the digest value for the given digest type
+        match self.type:
+            case DigestType.FILE_SHA256:
+                check_valid_sha256_digest(self.value)
+            case DigestType.OCI_DIGEST:
+                if not self.value.startswith("sha256:"):
+                    msg = f"OCI digest value must start with 'sha256:': {self.value}"
+                    raise ValueError(msg)
+                check_valid_sha256_digest(self.value.removeprefix("sha256:"))
+            case DigestType.OTHER:
+                # TODO: Maybe warning here
+                pass
+            case _:
+                msg = f"Unsupported digest type: {self.type}"
+                raise NotImplementedError(msg)
+
+
+def check_valid_sha256_digest(digest: str) -> None:
+    """
+    Check if the digest is a valid SHA256 digest.
+    """
+    if not (all(x in "0123456789abcdef" for x in digest) and len(digest) == 64):  # noqa: PLR2004
+        msg = f"Value is not a valid SHA256 digest: {digest}"
+        raise ValueError(msg)
diff --git a/src/dda/build/metadata/formats.py b/src/dda/build/metadata/formats.py
new file mode 100644
index 00000000..3fd738f3
--- /dev/null
+++ b/src/dda/build/metadata/formats.py
@@ -0,0 +1,105 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
+
+from __future__ import annotations
+
+from enum import StrEnum, auto
+from typing import override
+
+from msgspec import Struct
+
+from dda.build.metadata.digests import DigestType
+
+
+class ArtifactFormat(StrEnum):
+    """
+    Enum of all available artifact formats.
+    """
+
+    # Distribution formats
+    DEB = auto()
+    RPM = auto()
+    MSI = auto()
+    OCI = auto()
+
+    # Format for binary artifacts
+    BIN = auto()
+
+    # Catch-all for non-standard artifact formats
+    OTHER = auto()
+
+    @classmethod
+    @override
+    def _missing_(cls, value: object) -> ArtifactFormat:
+        # TODO: Add a warning here probably
+        return cls.OTHER
+
+    # Properties for the underlying format details
+    @property
+    def type(self) -> ArtifactType:
+        return _ARTIFACT_FORMAT_DETAILS[self.name].artifact_type
+
+    @property
+    def digest_type(self) -> DigestType:
+        return _ARTIFACT_FORMAT_DETAILS[self.name].digest_type
+
+    def get_file_identifier(self) -> str:
+        return _ARTIFACT_FORMAT_DETAILS[self.name].file_identifier
+
+
+class ArtifactType(StrEnum):
+    """
+    The type of a build artifact: either a "binary" or a "distribution" ("other" covers non-standard artifacts).
+    Each ArtifactFormat maps to exactly one of these types.
+    """
+
+    BIN = auto()
+    DIST = auto()
+    OTHER = auto()
+
+    @classmethod
+    @override
+    def _missing_(cls, value: object) -> ArtifactType:
+        # TODO: Add a warning here probably
+        return cls.OTHER
+
+
+class _ArtifactFormatDetails(Struct, frozen=True):
+    """
+    Details of an artifact format.
+    This implementation (a dict of details keyed by member name) is used to work around msgspec's encoding logic
+    for Enums, which cannot handle complex member values.
+    All valid formats are registered in the ArtifactFormat enum.
+    """
+
+    file_identifier: str
+    artifact_type: ArtifactType
+    digest_type: DigestType
+
+
+_ARTIFACT_FORMAT_DETAILS: dict[str, _ArtifactFormatDetails] = {
+    "DEB": _ArtifactFormatDetails(
+        file_identifier=".deb", artifact_type=ArtifactType.DIST, digest_type=DigestType.FILE_SHA256
+    ),
+    "RPM": _ArtifactFormatDetails(
+        file_identifier=".rpm", artifact_type=ArtifactType.DIST, digest_type=DigestType.FILE_SHA256
+    ),
+    "MSI": _ArtifactFormatDetails(
+        file_identifier=".msi", artifact_type=ArtifactType.DIST, digest_type=DigestType.FILE_SHA256
+    ),
+    "OCI": _ArtifactFormatDetails(
+        file_identifier="-oci.tar.gz", artifact_type=ArtifactType.DIST, digest_type=DigestType.OCI_DIGEST
+    ),
+    "BIN": _ArtifactFormatDetails(
+        file_identifier="", artifact_type=ArtifactType.BIN, digest_type=DigestType.FILE_SHA256
+    ),
+    "OTHER": _ArtifactFormatDetails(
+        file_identifier=".unknown", artifact_type=ArtifactType.OTHER, digest_type=DigestType.OTHER
+    ),
+}

+# Validate that the keys of the _ARTIFACT_FORMAT_DETAILS dict match the keys of the ArtifactFormat enum
+# Better to check this at import time than in a test
+if _ARTIFACT_FORMAT_DETAILS.keys() != ArtifactFormat.__members__.keys():
+    msg = "ArtifactFormat and _ArtifactFormatDetails keys do not match"
+    raise ValueError(msg)
diff --git a/src/dda/build/metadata/metadata.py b/src/dda/build/metadata/metadata.py
new file mode 100644
index 00000000..27add46c
--- /dev/null
+++ b/src/dda/build/metadata/metadata.py
@@ -0,0 +1,282 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import TYPE_CHECKING, Any
+from uuid import UUID  # noqa: TC003 - needed outside of typecheck for msgspec decode
+
+from msgspec import Struct
+
+from dda.build.metadata.digests import ArtifactDigest  # noqa: TC001 - needed outside of typecheck for msgspec decode
+from dda.build.metadata.formats import ArtifactFormat, ArtifactType
+from dda.build.metadata.platforms import OS, Arch, Platform
+from dda.utils.fs import Path
+from dda.utils.git.changeset import ChangeSet  # noqa: TC001 - needed outside of typecheck for msgspec decode
+from dda.utils.git.commit import Commit  # noqa: TC001
+
+if TYPE_CHECKING:
+    from click import Context
+
+    from dda.cli.application import Application
+
+
+class BuildMetadata(Struct, frozen=True):
+    """
+    Metadata about a build that can be used to identify it.
+    """
+
+    # Basic fields
+    id: UUID
+    agent_components: set[str]
+    artifact_format: ArtifactFormat
+
+    # Source tree fields
+    commit: Commit
+    worktree_diff: ChangeSet
+
+    # Compatibility fields
+    compatible_platforms: set[Platform]
+
+    # Build metadata
+    build_platform: Platform
+    build_time: datetime
+
+    # Artifact-related fields
+    digest: ArtifactDigest
+
+    def __post_init__(self) -> None:
+        if self.artifact_type == ArtifactType.BIN and len(self.agent_components) != 1:
+            msg = "An agent component artifact can only represent a single component"
+            raise ValueError(msg)
+
+        os_set = {platform.os for platform in self.compatible_platforms}
+        arch_set = {platform.arch for platform in self.compatible_platforms}
+        if OS.ANY in os_set and len(os_set) > 1:
+            msg = "Cannot use both the 'any' OS and other OSs in compatible platforms"
+            raise ValueError(msg)
+        if Arch.ANY in arch_set and len(arch_set) > 1:
+            msg = "Cannot use both the 'any' architecture and other architectures in compatible platforms"
+            raise ValueError(msg)
+
+    @property
+    def artifact_type(self) -> ArtifactType:
+        """The artifact type."""
+        return self.artifact_format.type
+
+    @classmethod
+    def spawn_from_context(
+        cls,
+        context_data: _MetadataRequiredContext,
+        artifact_digest: ArtifactDigest,
+    ) -> BuildMetadata:
+        """
+        Create a BuildMetadata instance for the current build.
+
+        Takes two arguments:
+
+        - context_data: A _MetadataRequiredContext instance containing the required data to generate build metadata.
+          This can be created with the `analyze_context` function.
+        - artifact_digest: An ArtifactDigest instance containing the digest of the artifact.
+          This can be calculated with the `DigestType.calculate_digest` method.
+          For example, starting from the _MetadataRequiredContext instance, you can do:
+
+          ```python
+          context_details: _MetadataRequiredContext = analyze_context(ctx, app)
+          artifact_digest = context_details.artifact_format.digest_type.calculate_digest(
+              app,
+              artifact_path,  # The path of the built artifact (or image spec for OCI)
+          )
+          ```
+
+        The _MetadataRequiredContext instance can have its fields overridden if the defaults obtained from the
+        context are not correct.
+
+        Returns:
+            A fully initialized BuildMetadata instance. The `id` and `build_time` fields are automatically generated.
+        """
+
+        # Current time and ID
+        build_time = datetime.now()  # noqa: DTZ005
+        artifact_id = generate_build_id()
+
+        return cls(
+            id=artifact_id,
+            build_time=build_time,
+            digest=artifact_digest,
+            **context_data.dump(),
+        )
+
+    def to_file(self, path: Path | None = None) -> None:
+        """
+        Write the build metadata to a file.
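+
+        If `path` is None, the canonical filename (see `get_canonical_filename`) with a `.json` extension is used.
+
+        A usage sketch (the output filename shown is illustrative):
+
+        ```python
+        metadata.to_file()  # e.g. writes `core-linux-amd64-12345678-db5fec1a.json`
+        metadata.to_file(Path("out/metadata.json"))
+        ```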
+ """ + from msgspec.json import encode + + from dda.types.hooks import enc_hook + + if path is None: + path = Path(f"{self.get_canonical_filename()}.json") + + path.write_atomic(encode(self, enc_hook=enc_hook), "wb") + + @classmethod + def from_file(cls, path: Path) -> BuildMetadata: + """ + Read the build metadata from a file. + """ + from msgspec.json import decode + + from dda.types.hooks import dec_hook + + return decode(path.read_text(encoding="utf-8"), type=cls, dec_hook=dec_hook) + + def get_canonical_filename(self) -> str: + """ + Get a predictable filename corresponding to the artifact represented by this metadata. + Schema is: `{components}-{compatibility}-{source info}-{short uuid}-{artifact format identifier}`. + Where: + - `components` is the name of the agent component. + For components named `x-agent`, we will use `x` instead, omitting the `-agent` suffix. + Any `-` characters will be replaced with `_`. + If there are multiple components, a comma-separated list will be used, sorted alphabetically. + - `compatibility` is a platform identifier, e.g. `linux-arm64`. + If there are multiple compatible platforms, the string `many` will be used instead. + If the platform compatibility is `any, any`, the string `any` will be used instead. + - `source_info` is the short commit SHA, appended with `+{worktree diff hash}` if there are any working tree changes. + - `short uuid` is the first section of the UUID attributed to the artifact. + - `artifact_format_identifier` gives info on the contents of the artifact when it is a dist. See `ArtifactFormat.get_file_identifier` for more details. + NOTE: For binaries, we do not use `.bin`, instead leaving the file extension blank. + + Returns: + A predictable filename corresponding to the artifact represented by this metadata. + """ + # TODO: Discuss this schema in a document, this is a first idea + + # Components + components = ",".join( + sorted(component.replace("-agent", "").replace("-", "_") for component in self.agent_components) + ) + + # Short UUID + short_uuid = self.id.urn.split(":")[2].split("-")[0] + + # Source info + source_info = self.commit.sha1[:8] + if self.worktree_diff.files: + source_info += f"+{self.worktree_diff.digest()[:8]}" + + # Compatibility + if Platform.ANY in self.compatible_platforms: + compatibility = "any" + elif len(self.compatible_platforms) > 1: + compatibility = "many" + else: + # Also handles the case in which os is `any` or `arch` is `any` + platform = self.compatible_platforms.copy().pop() + compatibility = str(platform) + + # Artifact format identifier + artifact_format_identifier = self.artifact_format.get_file_identifier() + return f"{components}-{compatibility}-{source_info}-{short_uuid}{artifact_format_identifier}" + + +def get_build_components(command: str) -> tuple[set[str], ArtifactFormat]: + """ + Parse calling command to get the agent components and artifact format. 
+
+    Ex:
+    `dda build bin core-agent` -> `({"core-agent"}, ArtifactFormat.BIN)`
+    `dda build dist deb -c core-agent -c process-agent` -> `({"core-agent", "process-agent"}, ArtifactFormat.DEB)`
+    """
+    command_parts = command.split(" ")
+    # Validate that the first two parts are `dda` and `build`: only build commands can be parsed here
+    if command_parts[:2] != ["dda", "build"]:
+        msg = f"Unexpected command, only build commands can be used to extract build components: {command}"
+        raise ValueError(msg)
+
+    artifact_format: ArtifactFormat
+    artifact_type_str = command_parts[2]
+    match artifact_type_str:
+        case "dist":
+            artifact_format = ArtifactFormat[command_parts[3].upper()]
+            # TODO: Implement this in a more robust way, write a proper parser for the command line
+            agent_components = {part for part in command_parts[4:] if part != "-c"}
+        case "bin":
+            artifact_format = ArtifactFormat.BIN
+            agent_components = {command_parts[3]}
+        case _:
+            msg = f"Unsupported artifact type: {artifact_type_str}"
+            raise NotImplementedError(msg)
+
+    return agent_components, artifact_format
+
+
+def generate_build_id() -> UUID:
+    """
+    Generate a unique build ID.
+    """
+    from uuid import uuid4
+
+    return uuid4()
+
+
+def analyze_context(ctx: Context, app: Application) -> _MetadataRequiredContext:
+    """
+    Analyze the context to get the required data to generate build metadata.
+    """
+    return _MetadataRequiredContext.from_context(ctx, app)
+
+
+class _MetadataRequiredContext(Struct):
+    """
+    Collection of fields obtained from the build context to generate build metadata.
+    Having this as a separate struct allows for easier overriding; this struct is explicitly not frozen.
+    """
+
+    agent_components: set[str]
+    artifact_format: ArtifactFormat
+
+    # Source tree fields
+    commit: Commit
+    worktree_diff: ChangeSet
+
+    # Compatibility fields
+    compatible_platforms: set[Platform]
+
+    # Build metadata
+    build_platform: Platform
+
+    @classmethod
+    def from_context(cls, ctx: Context, app: Application) -> _MetadataRequiredContext:
+        """
+        Create a _MetadataRequiredContext instance from the application and build context.
+        Some values might not be correct for some artifacts, in which case they should be overridden afterwards.
+
+        Defaults:
+        - agent_components: Extracted from the calling command for DIST artifacts, or set to a single component for BIN artifacts.
+        - artifact_format: Extracted from the calling command.
+        - commit: The HEAD commit of the currently checked-out repo.
+        - worktree_diff: The changes in the working tree compared to HEAD.
+        - compatible_platforms: Defaults to a singleton of the build platform. This can be overridden to add more compatible platforms if possible.
+        - build_platform: The platform this code is being run on.
+        """
+
+        import platform
+
+        # Build components
+        agent_components, artifact_format = get_build_components(ctx.command_path)
+
+        # Build platform
+        build_platform = Platform.from_alias(platform.system().lower(), platform.machine())
+
+        return cls(
+            agent_components=agent_components,
+            artifact_format=artifact_format,
+            commit=app.tools.git.get_commit(),
+            worktree_diff=app.tools.git.get_changes("HEAD", start="HEAD", working_tree=True),
+            compatible_platforms={build_platform},
+            build_platform=build_platform,
+        )
+
+    def dump(self) -> dict[str, Any]:
+        """
+        Dump the context data to a dictionary.
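+
+        A sketch of the intended flow (`ctx`, `app`, and `digest` are assumed to exist in the caller):
+
+        ```python
+        data = analyze_context(ctx, app)
+        data.compatible_platforms = {Platform.ANY}  # fields may be overridden, the struct is not frozen
+        metadata = BuildMetadata.spawn_from_context(data, digest)
+        ```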
+ """ + return {k: getattr(self, k) for k in self.__struct_fields__} diff --git a/src/dda/build/metadata/platforms.py b/src/dda/build/metadata/platforms.py new file mode 100644 index 00000000..3c970b0e --- /dev/null +++ b/src/dda/build/metadata/platforms.py @@ -0,0 +1,101 @@ +# SPDX-FileCopyrightText: 2025-present Datadog, Inc. +# +# SPDX-License-Identifier: MIT + +from enum import StrEnum, auto +from typing import ClassVar + +from msgspec import Struct + + +class OS(StrEnum): + """ + The operating system for which the build artifact is intended. + """ + + LINUX = auto() + WINDOWS = auto() + MACOS = auto() + + # Any OS - used for indicating compatibility with any operating system + ANY = auto() + + @classmethod + def from_alias(cls, alias: str) -> "OS": + """ + Get the OS enum value from an alias. + """ + match alias.lower(): + case "linux": + return cls.LINUX + case "windows" | "nt" | "win": + return cls.WINDOWS + case "macos" | "darwin" | "osx": + return cls.MACOS + case "any": + return cls.ANY + case _: + msg = f"Invalid OS identifier: {alias}" + raise ValueError(msg) + + +class Arch(StrEnum): + """ + The CPU architecture for which the build artifact is intended. + """ + + # x86 architectures - canonical name is amd64 + AMD64 = auto() + + # ARM architectures - canonical name is arm64 + ARM64 = auto() + + # ARMHF architectures + ARMHF = auto() + + # Any architecture - used for indicating compatibility with any CPU architecture + ANY = auto() + + @classmethod + def from_alias(cls, alias: str) -> "Arch": + """ + Get the Arch enum value from an alias. + """ + match alias.lower(): + case "amd64" | "x86_64" | "x86-64" | "x86" | "x64": + return cls.AMD64 + case "arm64" | "aarch64" | "arm" | "aarch": + return cls.ARM64 + case "any": + return cls.ANY + case _: + msg = f"Invalid Arch identifier: {alias}" + raise ValueError(msg) + + +class Platform(Struct, frozen=True): + """ + The platform for which the build artifact is intended. + """ + + os: OS + arch: Arch + + ANY: ClassVar["Platform"] + + @classmethod + def from_alias(cls, os_alias: str, arch_alias: str) -> "Platform": + """ + Get the Platform enum value from an alias. + """ + return cls(os=OS.from_alias(os_alias), arch=Arch.from_alias(arch_alias)) + + def __str__(self) -> str: + """ + Get the string representation of the platform. + """ + return f"{self.os}-{self.arch}" + + +# Initialize the ANY class variable after the class is fully defined +Platform.ANY = Platform(os=OS.ANY, arch=Arch.ANY) diff --git a/src/dda/tools/docker.py b/src/dda/tools/docker.py index b4b26971..5ac69454 100644 --- a/src/dda/tools/docker.py +++ b/src/dda/tools/docker.py @@ -34,3 +34,6 @@ def path(self) -> str: import shutil return shutil.which("docker") or shutil.which("podman") or "docker" + + def get_image_digest(self, image_spec: str) -> str: + return self.capture(["image", "inspect", image_spec, "--format", "{{.Id}}"]) diff --git a/src/dda/utils/fs.py b/src/dda/utils/fs.py index 9c8eda3a..cb84481e 100644 --- a/src/dda/utils/fs.py +++ b/src/dda/utils/fs.py @@ -154,6 +154,30 @@ def __as_exe(self) -> Path: def __as_exe(self) -> Path: return self + def hexdigest(self, *, algorithm: str = "sha256", buffer_size: int | None = None) -> str: + """ + Returns a hexadecimal string representation of the file's digest. + The algorithm used can be any algorithm supported by hashlib, as defined in hashlib. Defaults to sha256. + Will raise FileNotFoundError if the path does not exist, and OSError if it is not a file. 
+
+        Parameters:
+            algorithm: The hashing algorithm to use to calculate the digest, specified as a string. Must be a member of hashlib.algorithms_guaranteed.
+            buffer_size: The size, in bytes, of the in-memory buffer to use for reading the file while hashing. Defaults to None, which lets hashlib choose the buffer size.
+        """
+        import hashlib
+
+        # Note: algorithms_guaranteed is a subset of algorithms_available; we use it to avoid letting people use platform-specific algorithms.
+        if algorithm not in hashlib.algorithms_guaranteed:
+            msg = f"Invalid hashing algorithm `{algorithm}` requested. Must be one of {', '.join(hashlib.algorithms_guaranteed)}."
+            raise ValueError(msg)
+
+        # hashlib.file_digest exposes its read buffer size through the underscore-prefixed `_bufsize` keyword
+        bufsize_arg = {} if buffer_size is None else {"_bufsize": buffer_size}
+
+        with self.open("rb") as f:
+            digest = hashlib.file_digest(f, algorithm, **bufsize_arg)
+
+        return digest.hexdigest()
+
 
 @contextmanager
 def temp_directory() -> Generator[Path, None, None]:
@@ -190,6 +214,7 @@ def temp_file(suffix: str = "") -> Generator[Path, None, None]:
     from tempfile import NamedTemporaryFile
 
     with NamedTemporaryFile(suffix=suffix) as f:
+        f.close()  # NamedTemporaryFile yields an open file object, not just a path. We close it because, on Windows, the file cannot be opened a second time while this handle is open.
         yield Path(f.name).resolve()
diff --git a/src/dda/utils/git/changeset.py b/src/dda/utils/git/changeset.py
index 4dc2e0e8..4baf2306 100644
--- a/src/dda/utils/git/changeset.py
+++ b/src/dda/utils/git/changeset.py
@@ -69,7 +69,7 @@ def paths(self) -> MappingProxyType[str, ChangedFile]:
         return self.__changed
 
     @property
-    def files(self) -> Iterable[ChangedFile]:
+    def files(self) -> tuple[ChangedFile, ...]:
         return self.__files
 
     @property
diff --git a/tests/build/__init__.py b/tests/build/__init__.py
new file mode 100644
index 00000000..79ca6026
--- /dev/null
+++ b/tests/build/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
diff --git a/tests/build/fixtures/build_metadata.json b/tests/build/fixtures/build_metadata.json
new file mode 100644
index 00000000..fdd40199
--- /dev/null
+++ b/tests/build/fixtures/build_metadata.json
@@ -0,0 +1,49 @@
+{
+    "id": "00000000-0000-0000-0000-123456780000",
+    "agent_components": [
+        "trace-agent",
+        "core-agent"
+    ],
+    "artifact_format": "rpm",
+    "commit": {
+        "sha1": "1234567890123456789012345678901234567890",
+        "author": {
+            "name": "John Doe",
+            "email": "john.doe@example.com",
+            "timestamp": 1717171717
+        },
+        "committer": {
+            "name": "John Doe",
+            "email": "john.doe@example.com",
+            "timestamp": 1717171717
+        },
+        "message": "test"
+    },
+    "worktree_diff": [
+        {
+            "path": "test.txt",
+            "type": "A",
+            "patch": "@@ -0,0 +1 @@\n+test",
+            "binary": false
+        }
+    ],
+    "compatible_platforms": [
+        {
+            "os": "linux",
+            "arch": "amd64"
+        },
+        {
+            "os": "macos",
+            "arch": "arm64"
+        }
+    ],
+    "build_platform": {
+        "os": "macos",
+        "arch": "arm64"
+    },
+    "build_time": "2025-09-16T12:54:34.820949Z",
+    "digest": {
+        "type": "file_sha256",
+        "value": "4a23f9863c45f043ececc0f82bf95ae9e4ff377ec2bb587a61ea7a75a3621115"
+    }
+}
diff --git a/tests/build/test_metadata.py b/tests/build/test_metadata.py
new file mode 100644
index 00000000..bbc156e6
--- /dev/null
+++ b/tests/build/test_metadata.py
@@ -0,0 +1,346 @@
+# SPDX-FileCopyrightText: 2025-present Datadog, Inc.
+#
+# SPDX-License-Identifier: MIT
+from __future__ import annotations
+
+import json
+import platform
+from datetime import UTC, datetime
+from typing import Any
+from uuid import UUID
+
+import msgspec
+import pytest
+
+from dda.build.metadata.digests import ArtifactDigest, DigestType
+from dda.build.metadata.formats import ArtifactFormat, ArtifactType
+from dda.build.metadata.metadata import BuildMetadata, analyze_context
+from dda.build.metadata.platforms import OS, Arch, Platform
+from dda.config.model import dec_hook, enc_hook
+from dda.utils.fs import Path
+from dda.utils.git.changeset import ChangedFile, ChangeSet, ChangeType
+from dda.utils.git.commit import Commit, GitPersonDetails
+
+
+@pytest.fixture
+def example_author() -> GitPersonDetails:
+    return GitPersonDetails(name="John Doe", email="john.doe@example.com", timestamp=1717171717)
+
+
+@pytest.fixture
+def example_commit(example_author: GitPersonDetails) -> Commit:
+    return Commit(sha1="1234567890" * 4, author=example_author, committer=example_author, message="test")
+
+
+@pytest.fixture
+def example_build_metadata(example_commit: Commit) -> BuildMetadata:
+    return BuildMetadata(
+        id=UUID("00000000-0000-0000-0000-123456780000"),
+        agent_components={"core-agent", "trace-agent"},
+        artifact_format=ArtifactFormat.RPM,
+        commit=example_commit,
+        compatible_platforms={Platform(OS.LINUX, Arch.AMD64), Platform(OS.MACOS, Arch.ARM64)},
+        build_platform=Platform(OS.MACOS, Arch.ARM64),
+        build_time=datetime.fromisoformat("2025-09-16T12:54:34.820949Z"),
+        worktree_diff=ChangeSet([
+            ChangedFile(
+                path=Path("test.txt"),
+                type=ChangeType.ADDED,
+                patch="@@ -0,0 +1 @@\n+test",
+                binary=False,
+            )
+        ]),
+        digest=ArtifactDigest(
+            value="4a23f9863c45f043ececc0f82bf95ae9e4ff377ec2bb587a61ea7a75a3621115", type=DigestType.FILE_SHA256
+        ),
+    )
+
+
+def assert_metadata_equal(metadata: BuildMetadata, expected: BuildMetadata | dict[str, Any]) -> None:
+    get = dict.get if isinstance(expected, dict) else getattr
+
+    assert metadata.agent_components == get(expected, "agent_components")  # type: ignore[arg-type]
+    assert metadata.artifact_format == get(expected, "artifact_format")  # type: ignore[arg-type]
+    assert metadata.commit == get(expected, "commit")  # type: ignore[arg-type]
+    assert metadata.compatible_platforms == get(expected, "compatible_platforms")  # type: ignore[arg-type]
+    assert metadata.build_platform == get(expected, "build_platform")  # type: ignore[arg-type]
+    assert metadata.build_time == get(expected, "build_time")  # type: ignore[arg-type]
+    assert metadata.worktree_diff == get(expected, "worktree_diff")  # type: ignore[arg-type]
+    assert metadata.digest == get(expected, "digest")  # type: ignore[arg-type]
+
+
+class TestMetadata:
+    def test_basic(self, example_commit: Commit) -> None:
+        now = datetime.now(tz=UTC)
+        commit = example_commit
+        metadata = BuildMetadata(
+            id=UUID("00000000-0000-0000-0000-000000000000"),
+            agent_components={"core-agent"},
+            artifact_format=ArtifactFormat.BIN,
+            commit=commit,
+            compatible_platforms={Platform(OS.LINUX, Arch.AMD64)},
+            build_platform=Platform(OS.LINUX, Arch.AMD64),
+            build_time=now,
+            worktree_diff=ChangeSet({}),
+            digest=ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256),
+        )
+        expected = {
+            "agent_components": {"core-agent"},
+            "artifact_format": ArtifactFormat.BIN,
+            "commit": example_commit,
+            "compatible_platforms": {Platform(OS.LINUX, Arch.AMD64)},
+            "build_platform": Platform(OS.LINUX, Arch.AMD64),
+            "build_time": now,
+            "worktree_diff": ChangeSet({}),
+            "digest": ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256),
+        }
+        assert_metadata_equal(metadata, expected)
+
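+    # Illustrative sketch: the __post_init__ validation rejects BIN artifacts
+    # that declare more than one component.
+    def test_bin_requires_single_component(self, example_commit: Commit) -> None:
+        with pytest.raises(ValueError, match="single component"):
+            BuildMetadata(
+                id=UUID("00000000-0000-0000-0000-000000000000"),
+                agent_components={"core-agent", "trace-agent"},
+                artifact_format=ArtifactFormat.BIN,
+                commit=example_commit,
+                compatible_platforms={Platform(OS.LINUX, Arch.AMD64)},
+                build_platform=Platform(OS.LINUX, Arch.AMD64),
+                build_time=datetime.now(tz=UTC),
+                worktree_diff=ChangeSet({}),
+                digest=ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256),
+            )
+
+    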
@pytest.mark.parametrize( + ("command_path", "expected"), + [ + ( + "dda build bin core-agent", + ({"core-agent"}, ArtifactFormat.BIN), + ), + ( + "dda build dist deb -c core-agent -c process-agent", + ( + {"core-agent", "process-agent"}, + ArtifactFormat.DEB, + ), + ), + ( + "dda build dist oci -c core-agent -c process-agent", + ( + {"core-agent", "process-agent"}, + ArtifactFormat.OCI, + ), + ), + ], + ) + def test_analyze_context(self, app, mocker, command_path, expected, example_commit): + # Expected values + expected_agent_components, expected_artifact_format = expected + build_platform = Platform.from_alias(platform.system(), platform.machine()) + expected = { + "agent_components": expected_agent_components, + "artifact_format": expected_artifact_format, + "commit": example_commit, + "compatible_platforms": {build_platform}, + "build_platform": build_platform, + "worktree_diff": ChangeSet([ + ChangedFile(path=Path("test.txt"), type=ChangeType.ADDED, patch="@@ -0,0 +1 @@\n+test", binary=False) + ]), + } + + # Setup mocks + ctx = mocker.MagicMock() + ctx.command_path = command_path + mocker.patch("dda.tools.git.Git.get_commit", return_value=expected["commit"]) + mocker.patch("dda.tools.git.Git.get_changes", return_value=expected["worktree_diff"]) + + # Test without special arguments + context_details = analyze_context(ctx, app) + + for field in expected: + assert getattr(context_details, field) == expected[field] + + @pytest.mark.parametrize( + "obj_data", + [ + { + "id": UUID("12345678-0000-0000-0000-000000000000"), + "agent_components": {"core-agent"}, + "artifact_format": ArtifactFormat.BIN, + "compatible_platforms": {Platform(OS.LINUX, Arch.AMD64)}, + "build_platform": Platform(OS.LINUX, Arch.AMD64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256), + }, + { + "id": UUID("00000000-0000-0000-0000-123456780000"), + "agent_components": {"core-agent", "trace-agent"}, + "artifact_format": ArtifactFormat.RPM, + "compatible_platforms": {Platform(OS.LINUX, Arch.AMD64), Platform(OS.MACOS, Arch.ARM64)}, + "build_platform": Platform(OS.MACOS, Arch.ARM64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256), + }, + { + "id": UUID("00000000-0000-0000-0000-123456780000"), + "agent_components": {"core-agent"}, + "artifact_format": ArtifactFormat.OTHER, + "compatible_platforms": {Platform(OS.MACOS, Arch.ARM64)}, + "build_platform": Platform(OS.MACOS, Arch.ARM64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="i am a really weird digest", type=DigestType.OTHER), + }, + ], + ) + class TestEncodeDecode: + def test_basic(self, obj_data, example_commit): + obj = BuildMetadata(**obj_data, commit=example_commit, worktree_diff=ChangeSet({})) + encoded_obj = msgspec.json.encode(obj, enc_hook=enc_hook) + decoded_obj = msgspec.json.decode(encoded_obj, type=BuildMetadata, dec_hook=dec_hook) + assert_metadata_equal(decoded_obj, obj) + + def test_with_changeset(self, obj_data, example_commit): + changes = ChangeSet([ + ChangedFile( + path=Path("test.txt"), + type=ChangeType.ADDED, + patch="@@ -0,0 +1 @@\n+test", + binary=False, + ) + ]) + obj = BuildMetadata(**obj_data, commit=example_commit, worktree_diff=changes) + encoded_obj = msgspec.json.encode(obj, enc_hook=enc_hook) + decoded_obj = msgspec.json.decode(encoded_obj, type=BuildMetadata, dec_hook=dec_hook) + assert_metadata_equal(decoded_obj, obj) + + def test_to_file(self, temp_dir, example_build_metadata): + path = 
temp_dir / "build_metadata.json" + example_build_metadata.to_file(path) + text = path.read_text(encoding="utf-8") + + expected = (Path(__file__).parent / "fixtures" / "build_metadata.json").read_text(encoding="utf-8") + + # Compare both decoded jsons are equal - we decode to avoid being affected by key ordering or whitespace + encoded = json.loads(text) + expected = json.loads(expected) + + encoded_keys = set(encoded.keys()) + expected_keys = set(expected.keys()) + assert encoded_keys == expected_keys + for key in encoded_keys: + encoded_value, expected_value = encoded[key], expected[key] + if isinstance(encoded_value, list): + # We might have lists of dicts, so use dict.values() as a key for sorting + def sorter(x): + return list(x.values()) if isinstance(x, dict) else x + + assert sorted(encoded_value, key=sorter) == sorted(expected_value, key=sorter) + else: + assert encoded_value == expected_value + + def test_from_file(self, example_build_metadata): + path = Path(__file__).parent / "fixtures" / "build_metadata.json" + obj = BuildMetadata.from_file(path) + assert_metadata_equal(obj, example_build_metadata) + + @pytest.mark.parametrize( + ("obj_data", "expected"), + [ + ( + { + "id": UUID("db5fec1a-7fce-42e9-a29b-13a8cc6a5493"), + "agent_components": {"core-agent", "trace-agent"}, + "artifact_format": ArtifactFormat.RPM, + "compatible_platforms": {Platform(OS.LINUX, Arch.AMD64)}, + "build_platform": Platform(OS.MACOS, Arch.ARM64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256), + }, + "core,trace-linux-amd64-12345678-db5fec1a.rpm", + ), + ( + { + "id": UUID("db5fec1a-7fce-42e9-a29b-13a8cc6a5493"), + "agent_components": {"dogstatd"}, + "artifact_format": ArtifactFormat.BIN, + "compatible_platforms": {Platform(OS.LINUX, Arch.AMD64)}, + "build_platform": Platform(OS.MACOS, Arch.ARM64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="0" * 64, type=DigestType.FILE_SHA256), + "worktree_diff": ChangeSet([ + ChangedFile( + path=Path("test.txt"), + type=ChangeType.ADDED, + patch="@@ -0,0 +1 @@\n+test", + binary=False, + ), + ]), + }, + "dogstatd-linux-amd64-12345678+c7bcba44-db5fec1a", + ), + ( + { + "id": UUID("db5fec1a-7fce-42e9-a29b-13a8cc6a5493"), + "agent_components": {"core-agent", "system-probe", "dogstatsd"}, + "artifact_format": ArtifactFormat.OCI, + "compatible_platforms": { + Platform(OS.LINUX, Arch.AMD64), + Platform(OS.MACOS, Arch.ARM64), + Platform(OS.LINUX, Arch.ARM64), + Platform(OS.MACOS, Arch.AMD64), + }, + "build_platform": Platform(OS.MACOS, Arch.ARM64), + "build_time": datetime.now(UTC), + "digest": ArtifactDigest(value="sha256:" + "0" * 64, type=DigestType.OCI_DIGEST), + }, + "core,dogstatsd,system_probe-many-12345678-db5fec1a-oci.tar.gz", + ), + ], + ) + def test_get_canonical_filename(self, obj_data, example_commit, expected): + if "worktree_diff" not in obj_data: + obj_data["worktree_diff"] = ChangeSet({}) + obj = BuildMetadata(**obj_data, commit=example_commit) + assert obj.get_canonical_filename() == expected + + +class TestDigest: + def test_calculate_digest(self, app, mocker): + # Setup mocks + mocker.patch("dda.utils.fs.Path.hexdigest", return_value="0" * 64) + mocker.patch("dda.tools.docker.Docker.get_image_digest", return_value="sha256:" + "0" * 64) + + # Test + digest = DigestType.FILE_SHA256.calculate_digest(app, "test.txt") + assert digest.value == "0" * 64 + assert digest.type == DigestType.FILE_SHA256 + + digest = DigestType.OCI_DIGEST.calculate_digest(app, "test.txt") + assert 
digest.value == "sha256:" + "0" * 64 + assert digest.type == DigestType.OCI_DIGEST + + with pytest.raises(NotImplementedError): + DigestType.OTHER.calculate_digest(app, "test.txt") + + +class TestEncodeDecodeOtherValues: + """Test that we can encode and decode other values that are not part of the enum.""" + + def test_other_digest_type(self): + digest_type = DigestType.OTHER + encoded_digest_type = msgspec.json.encode(digest_type, enc_hook=enc_hook) + decoded_digest_type = msgspec.json.decode(encoded_digest_type, type=DigestType, dec_hook=dec_hook) + assert decoded_digest_type == DigestType.OTHER + + # Test that an unknown digest type is decoded as OTHER + decoded_digest = msgspec.json.decode(b'"i am a really weird digest"', type=DigestType, dec_hook=dec_hook) + assert decoded_digest == DigestType.OTHER + + def test_other_artifact_format(self): + artifact_format = ArtifactFormat.OTHER + encoded_artifact_format = msgspec.json.encode(artifact_format, enc_hook=enc_hook) + decoded_artifact_format = msgspec.json.decode(encoded_artifact_format, type=ArtifactFormat, dec_hook=dec_hook) + assert decoded_artifact_format == ArtifactFormat.OTHER + + # Test that an unknown artifact format is decoded as OTHER + decoded_artifact_format = msgspec.json.decode( + b'"i am a really weird artifact format"', type=ArtifactFormat, dec_hook=dec_hook + ) + assert decoded_artifact_format == ArtifactFormat.OTHER + + def test_other_artifact_type(self): + artifact_type = ArtifactType.OTHER + encoded_artifact_type = msgspec.json.encode(artifact_type, enc_hook=enc_hook) + decoded_artifact_type = msgspec.json.decode(encoded_artifact_type, type=ArtifactType, dec_hook=dec_hook) + assert decoded_artifact_type == ArtifactType.OTHER + + # Test that an unknown artifact type is decoded as OTHER + decoded_artifact_type = msgspec.json.decode( + b'"i am a really weird artifact type"', type=ArtifactType, dec_hook=dec_hook + ) + assert decoded_artifact_type == ArtifactType.OTHER diff --git a/tests/utils/git/test_changeset.py b/tests/utils/git/test_changeset.py index 1f3fd793..1774ccb6 100644 --- a/tests/utils/git/test_changeset.py +++ b/tests/utils/git/test_changeset.py @@ -56,6 +56,27 @@ def test_properties(self): assert changeset.deleted == {str(deleted.path): deleted} assert changeset.paths == {str(added.path): added, str(modified.path): modified, str(deleted.path): deleted} + def test_union(self): + changeset1 = ChangeSet([ + ChangedFile(path=Path("/path/to/file"), type=ChangeType.ADDED, binary=False, patch="patch") + ]) + changeset2 = ChangeSet([ + ChangedFile(path=Path("/path/to/file2"), type=ChangeType.DELETED, binary=True, patch="patch2") + ]) + assert changeset1 | changeset2 == ChangeSet([ + ChangedFile(path=Path("/path/to/file"), type=ChangeType.ADDED, binary=False, patch="patch"), + ChangedFile(path=Path("/path/to/file2"), type=ChangeType.DELETED, binary=True, patch="patch2"), + ]) + + def test_eq(self): + changeset1 = ChangeSet([ + ChangedFile(path=Path("/path/to/file"), type=ChangeType.ADDED, binary=False, patch="patch") + ]) + changeset2 = ChangeSet([ + ChangedFile(path=Path("/path/to/file"), type=ChangeType.ADDED, binary=False, patch="patch") + ]) + assert changeset1 == changeset2 + @pytest.mark.parametrize( "repo_testcase", REPO_TESTCASES, diff --git a/tests/utils/test_fs.py b/tests/utils/test_fs.py index 83751139..c47fe1de 100644 --- a/tests/utils/test_fs.py +++ b/tests/utils/test_fs.py @@ -10,7 +10,7 @@ import pytest from dda.types.hooks import dec_hook, enc_hook -from dda.utils.fs import Path, temp_directory +from 
dda.utils.fs import Path, temp_directory, temp_file class TestPath: @@ -66,6 +66,64 @@ def test_encode_decode_windows(self, path_str): decoded_path = msgspec.json.decode(encoded_path, type=Path, dec_hook=dec_hook) assert decoded_path == path + @pytest.mark.parametrize( + ("data", "hashes"), + [ + ( + b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. In aliquet felis in pretium aliquet. Ut ac laoreet quam. Vivamus velit.", + { + "sha256": "cfa2679987c09a6a650119bb403b7409097dde137fd5d6c5b73e1887ce2b1fa9", + "sha512": "51799af49d7e298ae8ecc9d3cb8494ce1798bc30b8ec00bd94ba9b4dcdf1cf44fb3adf2ff4046543a50b0c8d480813895712b53561803dd482037d670ee92dbb", + "blake2b": "4398d6c895645672abe343c4bcaa283adb7e35e534b1dc164a2426040ca74b4b285b5847f5fa86ce51aa84159a587cc5332cbeb8ceaa3b98c4f6fccdbc36df55", + }, + ), + ( + b"Etiam eget imperdiet enim, vel blandit mauris. Cras dapibus nam.", + { + "sha256": "ef7f1b043a2b91c2900ead0091f1ac715a8e97f5bf4508cff3738ac11b711f03", + "sha512": "21907a7f2c887c871a38919154916a81e2f3bb75dd26bbcde2d12c4c3d2c82eb83f94446196fcbdd35cad5bd9e17f3efafb2e2300949b0d93e18b5c54df1ab56", + "blake2b": "3815deec2d01e0a7ca59fb3c7782f668d10ff04b3d0c3973db1f934f18d92ec24c3fc2aead937736144f0e65d29440520de68ce75d88e258db72b1062edb825c", + }, + ), + ( + b"Phasellus congue commodo erat quis eleifend. Nulla semper velit eget mauris ultricies laoreet.\n Sed orci tellus, venenatis vitae egestas vel, vehicula eget odio.", + { + "sha256": "4ca6c0ca071e0cfdab3f6aeea99f852d6b9e2981409ffffa7a8f88b1e951c605", + "sha512": "c357c4eb2c8093774a807393fc19dcd980d8335c5d6e1d8b98bc1b8be2002a4c3f0d19b78f5d08290f8ec30f22951d3bff72c0ae8f0bfbe88429f9234ecb49d9", + "blake2b": "12edd771bf10e018bf31896985a4ce36941127d80d1e738ad712e0e4717c04deb0fa98f1dad02126e903f370019d9168041240722dc821386f5ff4efce36dd63", + }, + ), + ], + ) + def test_hexdigest(self, data, hashes): + algos = hashes.keys() + with temp_file(suffix=".txt") as path: + path.write_atomic(data, "wb") + for algo in algos: + # Try with the default buffer size + digest = path.hexdigest(algorithm=algo) + assert digest == hashes[algo] + + # Try with another buffer size + digest = path.hexdigest(algorithm=algo, buffer_size=32) + assert digest == hashes[algo] + + def test_hexdigest_invalid_call(self, temp_dir): + fake_algo = "not_a_real_algo" + with pytest.raises( + ValueError, + match=f"Invalid hashing algorithm `{fake_algo}` requested.", + ): + Path().hexdigest(algorithm=fake_algo) + + non_existing_path = Path("/non_existing_file.txt") + with pytest.raises(FileNotFoundError): + non_existing_path.hexdigest() + + non_file_path = temp_dir + with pytest.raises((IsADirectoryError, PermissionError)): + non_file_path.hexdigest() + def test_temp_directory(): with temp_directory() as temp_dir: