diff --git a/ci/ray_ci/automation/BUILD.bazel b/ci/ray_ci/automation/BUILD.bazel
index 000fd14b9c8a..05db7fa08894 100644
--- a/ci/ray_ci/automation/BUILD.bazel
+++ b/ci/ray_ci/automation/BUILD.bazel
@@ -316,3 +316,29 @@ py_binary(
         ci_require("click"),
     ],
 )
+
+py_binary(
+    name = "push_ray_image",
+    srcs = ["push_ray_image.py"],
+    exec_compatible_with = ["//bazel:py3"],
+    deps = [
+        ":crane_lib",
+        "//ci/ray_ci:ray_ci_lib",
+        ci_require("click"),
+    ],
+)
+
+py_test(
+    name = "test_push_ray_image",
+    size = "small",
+    srcs = ["test_push_ray_image.py"],
+    exec_compatible_with = ["//bazel:py3"],
+    tags = [
+        "ci_unit",
+        "team:ci",
+    ],
+    deps = [
+        ":push_ray_image",
+        ci_require("pytest"),
+    ],
+)
diff --git a/ci/ray_ci/automation/push_ray_image.py b/ci/ray_ci/automation/push_ray_image.py
new file mode 100644
index 000000000000..a40e715992a5
--- /dev/null
+++ b/ci/ray_ci/automation/push_ray_image.py
@@ -0,0 +1,362 @@
+"""Publish a Wanda-cached ray image to Docker Hub under its destination tags."""
+
+import logging
+import sys
+from datetime import datetime
+from typing import List
+
+import click
+
+from ci.ray_ci.automation.crane_lib import (
+    call_crane_copy,
+    call_crane_manifest,
+)
+from ci.ray_ci.configs import (
+    ARCHITECTURE,
+    DEFAULT_ARCHITECTURE,
+    DEFAULT_PYTHON_TAG_VERSION,
+    PYTHON_VERSIONS,
+)
+from ci.ray_ci.docker_container import (
+    ARCHITECTURES_RAY,
+    ARCHITECTURES_RAY_LLM,
+    ARCHITECTURES_RAY_ML,
+    GPU_PLATFORM,
+    PLATFORMS_RAY,
+    PLATFORMS_RAY_LLM,
+    PLATFORMS_RAY_ML,
+    PYTHON_VERSIONS_RAY,
+    PYTHON_VERSIONS_RAY_LLM,
+    PYTHON_VERSIONS_RAY_ML,
+    RAY_REPO_MAP,
+    RayType,
+)
+from ci.ray_ci.utils import ecr_docker_login
+
+VALID_IMAGE_TYPES = [rt.value for rt in RayType]
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+
+
+class PushRayImageError(Exception):
+    """Error raised when pushing ray images fails."""
+
+
+def compact_cuda_suffix(platform: str) -> str:
+    """Convert a CUDA platform string to its compact tag suffix.
+
+    Only the part before the first "-" is used; its first two dot-separated
+    components are joined (e.g. cu12.1.1-cudnn8 -> -cu121).
+
+    Raises:
+        PushRayImageError: If that leading part contains no "." (e.g. "cpu").
+    """
+    platform_base = platform.split("-", 1)[0]
+    parts = platform_base.split(".")
+    if len(parts) < 2:
+        raise PushRayImageError(f"Unrecognized GPU platform format: {platform}")
+
+    return f"-{parts[0]}{parts[1]}"
+
+
+class RayImagePushContext:
+    """Context for publishing a ray image from Wanda cache to Docker Hub.
+
+    Holds the build parameters of one image and derives from them the Wanda
+    source tag, the Docker Hub repository and the destination tags.
+    """
+
+    ray_type: RayType
+    python_version: str
+    platform: str
+    architecture: str
+    branch: str
+    commit: str
+    rayci_schedule: str
+    rayci_build_id: str
+    pull_request: str  # buildkite uses "false" or number string
+    # Computed fields (set in __init__)
+    arch_suffix: str
+    wanda_tag: str
+    docker_hub_repo: str
+
+    def __init__(
+        self,
+        ray_type: RayType,
+        python_version: str,
+        platform: str,
+        architecture: str,
+        branch: str,
+        commit: str,
+        rayci_schedule: str,
+        rayci_build_id: str,
+        pull_request: str,
+    ) -> None:
+        """Store the build parameters and compute the derived fields."""
+        self.ray_type = ray_type
+        self.python_version = python_version
+        self.platform = platform
+        self.architecture = architecture
+        self.branch = branch
+        self.commit = commit
+        self.rayci_schedule = rayci_schedule
+        self.rayci_build_id = rayci_build_id
+        self.pull_request = pull_request
+
+        # The default architecture is implicit in image names and tags.
+        self.arch_suffix = (
+            "" if architecture == DEFAULT_ARCHITECTURE else f"-{architecture}"
+        )
+        # wanda_image_name() reads arch_suffix, so it must be set first.
+        self.wanda_tag = f"{rayci_build_id}-{self.wanda_image_name()}"
+        self.docker_hub_repo = f"rayproject/{RAY_REPO_MAP[self.ray_type.value]}"
+
+    def assert_published_image_type(self) -> None:
+        """Check that this python/platform/architecture combination is published.
+
+        Raises:
+            PushRayImageError: If the python version, platform or architecture
+                is not in the allowed set for this image type.
+        """
+        if self.ray_type in [RayType.RAY_ML, RayType.RAY_ML_EXTRA]:
+            python_versions = PYTHON_VERSIONS_RAY_ML
+            platforms = PLATFORMS_RAY_ML
+            architectures = ARCHITECTURES_RAY_ML
+        elif self.ray_type in [RayType.RAY_LLM, RayType.RAY_LLM_EXTRA]:
+            python_versions = PYTHON_VERSIONS_RAY_LLM
+            platforms = PLATFORMS_RAY_LLM
+            architectures = ARCHITECTURES_RAY_LLM
+        else:
+            # ray or ray-extra
+            python_versions = PYTHON_VERSIONS_RAY
+            platforms = PLATFORMS_RAY
+            architectures = ARCHITECTURES_RAY
+
+        # Checked in this order so the first failing field is the one reported.
+        if self.python_version not in python_versions:
+            raise PushRayImageError(
+                f"Invalid python version {self.python_version} for {self.ray_type}"
+            )
+        if self.platform not in platforms:
+            raise PushRayImageError(
+                f"Invalid platform {self.platform} for {self.ray_type}"
+            )
+        if self.architecture not in architectures:
+            raise PushRayImageError(
+                f"Invalid architecture {self.architecture} for {self.ray_type}"
+            )
+
+    def destination_tags(self) -> List[str]:
+        """
+        Compute the destination tags for this context.
+
+        Tags are formed as:
+            {version}{variation}{python_suffix}{platform}{architecture_suffix}
+
+        For example:
+            - nightly.260107.abc123-py310-cpu
+            - nightly-extra-py310-cu121
+            - nightly.260107.abc123-extra-py310-gpu
+            - 2.53.0.abc123-py310-cu121
+            - 2.53.0.abc123-extra-py310-cu121
+        """
+        variation = self._variation_suffix()
+        python_suffixes = self._python_suffixes()
+        platform_suffixes = self._platform_suffixes()
+        return [
+            f"{version}{variation}{py}{plat}{self.arch_suffix}"
+            for version in self._versions()
+            for plat in platform_suffixes
+            for py in python_suffixes
+        ]
+
+    def wanda_image_name(self) -> str:
+        """Get the wanda source image name for this context.
+
+        Unlike destination tags, the python version and platform are used
+        verbatim (e.g. ray-py3.11-cu12.1.1-cudnn8-aarch64).
+        """
+        return (
+            f"{self.ray_type.value}-py{self.python_version}"
+            f"-{self.platform}{self.arch_suffix}"
+        )
+
+    def _versions(self) -> List[str]:
+        """Compute version tags based on branch/schedule/PR status.
+
+        Precedence is master, then release branch, then pull request; any
+        other build is tagged by commit sha and build id.
+        """
+        sha_tag = self.commit[:6]
+
+        if self.branch == "master":
+            if self.rayci_schedule == "nightly":
+                # datetime.now() without a tz is the CI machine's local date.
+                formatted_date = datetime.now().strftime("%y%m%d")
+                return [f"nightly.{formatted_date}.{sha_tag}", "nightly"]
+            return [sha_tag, self.rayci_build_id]
+
+        if self.branch and self.branch.startswith("releases/"):
+            release_name = self.branch[len("releases/") :]
+            return [f"{release_name}.{sha_tag}"]
+
+        # Buildkite reports "false" for non-PR builds; an empty value is also
+        # treated as "not a PR" so a malformed "pr-." tag is never produced.
+        if self.pull_request and self.pull_request != "false":
+            return [f"pr-{self.pull_request}.{sha_tag}", self.rayci_build_id]
+
+        return [sha_tag, self.rayci_build_id]
+
+    def _variation_suffix(self) -> str:
+        """Get -extra suffix for extra image types."""
+        if self.ray_type in {
+            RayType.RAY_EXTRA,
+            RayType.RAY_ML_EXTRA,
+            RayType.RAY_LLM_EXTRA,
+        }:
+            return "-extra"
+        return ""
+
+    def _python_suffixes(self) -> List[str]:
+        """Get python version suffixes (includes empty for default version)."""
+        suffixes = [f"-py{self.python_version.replace('.', '')}"]
+        if self.python_version == DEFAULT_PYTHON_TAG_VERSION:
+            suffixes.append("")
+        return suffixes
+
+    def _platform_suffixes(self) -> List[str]:
+        """Get platform suffixes (includes aliases like -gpu for GPU_PLATFORM)."""
+        if self.platform == "cpu":
+            suffixes = ["-cpu"]
+            # no tag is alias to cpu for ray image
+            if self.ray_type in {RayType.RAY, RayType.RAY_EXTRA}:
+                suffixes.append("")
+            return suffixes
+
+        suffixes = [compact_cuda_suffix(self.platform)]
+        if self.platform == GPU_PLATFORM:
+            # gpu is alias to GPU_PLATFORM value for ray image
+            suffixes.append("-gpu")
+            # no tag is alias to gpu for ray-ml image
+            if self.ray_type in {RayType.RAY_ML, RayType.RAY_ML_EXTRA}:
+                suffixes.append("")
+
+        return suffixes
+
+
+def _image_exists(tag: str) -> bool:
+    """Check if a container image manifest exists using crane."""
+    return_code, _ = call_crane_manifest(tag)
+    return return_code == 0
+
+
+def _copy_image(reference: str, destination: str, dry_run: bool = False) -> None:
+    """Copy a container image from source to destination using crane.
+
+    In dry-run mode the copy is only logged and crane is not invoked.
+
+    Raises:
+        PushRayImageError: If crane returns a non-zero return code.
+    """
+    if dry_run:
+        logger.info(f"DRY RUN: Would copy {reference} -> {destination}")
+        return
+
+    logger.info(f"Copying {reference} -> {destination}")
+    return_code, output = call_crane_copy(reference, destination)
+    if return_code != 0:
+        raise PushRayImageError(f"Crane copy failed: {output}")
+    logger.info(f"Successfully copied to {destination}")
+
+
+@click.command()
+@click.option(
+    "--python-version", type=click.Choice(list(PYTHON_VERSIONS.keys())), required=True
+)
+@click.option("--platform", type=click.Choice(list(PLATFORMS_RAY)), required=True)
+@click.option(
+    "--image-type",
+    type=click.Choice(VALID_IMAGE_TYPES),
+    required=True,
+)
+@click.option("--architecture", type=click.Choice(ARCHITECTURE), required=True)
+@click.option("--rayci-work-repo", type=str, required=True, envvar="RAYCI_WORK_REPO")
+@click.option("--rayci-build-id", type=str, required=True, envvar="RAYCI_BUILD_ID")
+@click.option("--branch", type=str, required=True, envvar="BUILDKITE_BRANCH")
+@click.option("--commit", type=str, required=True, envvar="BUILDKITE_COMMIT")
+@click.option("--rayci-schedule", type=str, default="", envvar="RAYCI_SCHEDULE")
+@click.option(
+    "--pull-request", type=str, default="false", envvar="BUILDKITE_PULL_REQUEST"
+)
+@click.option("--upload", is_flag=True, default=False)
+def main(
+    python_version: str,
+    platform: str,
+    image_type: str,
+    architecture: str,
+    rayci_work_repo: str,
+    rayci_build_id: str,
+    branch: str,
+    commit: str,
+    rayci_schedule: str,
+    pull_request: str,
+    upload: bool,
+) -> None:
+    """
+    Publish a Wanda-cached ray image to Docker Hub.
+
+    Tags are generated matching the original RayDockerContainer format:
+        {version}{variation}{python_suffix}{platform}{architecture_suffix}
+    """
+    # Pushing is opt-in: without --upload every copy is only logged.
+    dry_run = not upload
+    if dry_run:
+        logger.info("DRY RUN MODE - no images will be pushed")
+
+    ctx = RayImagePushContext(
+        ray_type=RayType(image_type),
+        python_version=python_version,
+        platform=platform,
+        architecture=architecture,
+        branch=branch,
+        commit=commit,
+        rayci_schedule=rayci_schedule,
+        rayci_build_id=rayci_build_id,
+        pull_request=pull_request,
+    )
+
+    ctx.assert_published_image_type()
+
+    # The registry host is the part of the work repo before the first "/".
+    ecr_registry = rayci_work_repo.split("/")[0]
+    ecr_docker_login(ecr_registry)
+
+    src_ref = f"{rayci_work_repo}:{ctx.wanda_tag}"
+    logger.info(f"Verifying source image in Wanda cache: {src_ref}")
+    if not _image_exists(src_ref):
+        raise PushRayImageError(f"Source image not found in Wanda cache: {src_ref}")
+
+    destination_tags = ctx.destination_tags()
+    for tag in destination_tags:
+        dest_ref = f"{ctx.docker_hub_repo}:{tag}"
+        _copy_image(src_ref, dest_ref, dry_run=dry_run)
+
+    if dry_run:
+        logger.info(
+            f"DRY RUN: Would have pushed {ctx.ray_type.value} image with tags: "
+            f"{destination_tags}"
+        )
+    else:
+        logger.info(
+            f"Successfully pushed {ctx.ray_type.value} image with tags: "
+            f"{destination_tags}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ci/ray_ci/automation/test_push_ray_image.py b/ci/ray_ci/automation/test_push_ray_image.py
new file mode 100644
index 000000000000..50612bd60d61
--- /dev/null
+++ b/ci/ray_ci/automation/test_push_ray_image.py
@@ -0,0 +1,329 @@
+import sys
+from unittest import mock
+
+import pytest
+
+from ci.ray_ci.automation.push_ray_image import RayImagePushContext, compact_cuda_suffix
+from ci.ray_ci.configs import DEFAULT_ARCHITECTURE, DEFAULT_PYTHON_TAG_VERSION
+from ci.ray_ci.docker_container import GPU_PLATFORM, RayType
+
+
+def make_ctx(**overrides) -> RayImagePushContext:
+    """Create a RayImagePushContext with defaults for testing."""
+    defaults = {
+        "ray_type": RayType.RAY,
+        "python_version": DEFAULT_PYTHON_TAG_VERSION,
+        "platform": "cpu",
+        "architecture": DEFAULT_ARCHITECTURE,
+        "branch": "master",
+        "commit": "abc123",
+        "rayci_schedule": "",
+        "rayci_build_id": "build123",
+        "pull_request": "false",
+    }
+    defaults.update(overrides)
+
+    return RayImagePushContext(**defaults)
+
+
+class TestWandaImageName:
+    DEFAULT_TEST_CUDA_PLATFORM = "cu12.1.1-cudnn8"
+
+    @pytest.mark.parametrize(
+        ("ray_type", "python_version", "platform", "architecture", "expected"),
+        [
+            # CPU images
+            (RayType.RAY, "3.10", "cpu", DEFAULT_ARCHITECTURE, "ray-py3.10-cpu"),
+            (RayType.RAY, "3.10", "cpu", "aarch64", "ray-py3.10-cpu-aarch64"),
+            (
+                RayType.RAY_EXTRA,
+                "3.10",
+                "cpu",
+                DEFAULT_ARCHITECTURE,
+                "ray-extra-py3.10-cpu",
+            ),
+            # CUDA images
+            (
+                RayType.RAY,
+                "3.11",
+                DEFAULT_TEST_CUDA_PLATFORM,
+                DEFAULT_ARCHITECTURE,
+                f"ray-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}",
+            ),
+            (
+                RayType.RAY,
+                "3.11",
+                DEFAULT_TEST_CUDA_PLATFORM,
+                "aarch64",
+                f"ray-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}-aarch64",
+            ),
+            (
+                RayType.RAY_EXTRA,
+                "3.11",
+                DEFAULT_TEST_CUDA_PLATFORM,
+                DEFAULT_ARCHITECTURE,
+                f"ray-extra-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}",
+            ),
+            (
+                RayType.RAY_LLM,
+                "3.11",
+                DEFAULT_TEST_CUDA_PLATFORM,
+                DEFAULT_ARCHITECTURE,
+                f"ray-llm-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}",
+            ),
+            (
+                RayType.RAY_LLM_EXTRA,
+                "3.11",
+                DEFAULT_TEST_CUDA_PLATFORM,
+                DEFAULT_ARCHITECTURE,
+                f"ray-llm-extra-py3.11-{DEFAULT_TEST_CUDA_PLATFORM}",
+            ),
+        ],
+    )
+    def test_wanda_image_name(
+        self, ray_type, python_version, platform, architecture, expected
+    ):
+        ctx = make_ctx(
+            ray_type=ray_type,
+            python_version=python_version,
+            platform=platform,
+            architecture=architecture,
+        )
+        assert ctx.wanda_image_name() == expected
+
+
+class TestVariationSuffix:
+    @pytest.mark.parametrize(
+        ("ray_type", "expected"),
+        [
+            (RayType.RAY, ""),
+            (RayType.RAY_EXTRA, "-extra"),
+            (RayType.RAY_ML, ""),
+            (RayType.RAY_ML_EXTRA, "-extra"),
+            (RayType.RAY_LLM, ""),
+            (RayType.RAY_LLM_EXTRA, "-extra"),
+        ],
+    )
+    def test_variation_suffix(self, ray_type, expected):
+        ctx = make_ctx(ray_type=ray_type)
+        assert ctx._variation_suffix() == expected
+
+
+class TestPythonSuffixes:
+    @pytest.mark.parametrize(
+        ("python_version", "expected"),
+        [
+            (
+                DEFAULT_PYTHON_TAG_VERSION,
+                ["-py" + DEFAULT_PYTHON_TAG_VERSION.replace(".", ""), ""],
+            ),  # default gets empty suffix too
+            ("3.99", ["-py399"]),  # non-default gets no empty suffix
+        ],
+    )
+    def test_python_suffixes(self, python_version, expected):
+        ctx = make_ctx(python_version=python_version)
+        assert ctx._python_suffixes() == expected
+
+
+class TestPlatformSuffixes:
+    @pytest.mark.parametrize(
+        ("platform", "ray_type", "expected"),
+        [
+            # CPU images
+            ("cpu", RayType.RAY, ["-cpu", ""]),
+            ("cpu", RayType.RAY_EXTRA, ["-cpu", ""]),
+            ("cpu", RayType.RAY_ML, ["-cpu"]),  # ray-ml doesn't get empty for cpu
+            # CUDA images
+            ("cu11.7.1-cudnn8", RayType.RAY, ["-cu117"]),
+            ("cu11.8.0-cudnn8", RayType.RAY, ["-cu118"]),
+            (GPU_PLATFORM, RayType.RAY, [compact_cuda_suffix(GPU_PLATFORM), "-gpu"]),
+            (
+                GPU_PLATFORM,
+                RayType.RAY_ML,
+                [compact_cuda_suffix(GPU_PLATFORM), "-gpu", ""],
+            ),  # ray-ml gets empty for GPU_PLATFORM
+        ],
+    )
+    def test_platform_suffixes(self, platform, ray_type, expected):
+        ctx = make_ctx(platform=platform, ray_type=ray_type)
+        assert ctx._platform_suffixes() == expected
+
+
+class TestVersions:
+    @mock.patch("ci.ray_ci.automation.push_ray_image.datetime")
+    def test_nightly_master(self, mock_datetime):
+        mock_datetime.now.return_value.strftime.return_value = "260107"
+        ctx = make_ctx(branch="master", commit="abc123def456", rayci_schedule="nightly")
+        assert ctx._versions() == ["nightly.260107.abc123", "nightly"]
+
+    def test_release_branch(self):
+        ctx = make_ctx(branch="releases/2.44.0", commit="abc123def456")
+        assert ctx._versions() == ["2.44.0.abc123"]
+
+    def test_pull_request(self):
+        ctx = make_ctx(
+            branch="feature-branch", commit="abc123def456", pull_request="12345"
+        )
+        assert ctx._versions() == ["pr-12345.abc123", "build123"]
+
+    def test_other_branch(self):
+        ctx = make_ctx(branch="feature-branch", commit="abc123def456")
+        assert ctx._versions() == ["abc123", "build123"]
+
+    def test_empty_pull_request_is_not_pr(self):
+        """An empty pull_request value must not yield a malformed "pr-." tag."""
+        ctx = make_ctx(branch="feature-branch", commit="abc123def456", pull_request="")
+        assert ctx._versions() == ["abc123", "build123"]
+
+    def test_master_non_nightly(self):
+        """Master branch without nightly schedule returns sha tags, not PR tags."""
+        ctx = make_ctx(
+            branch="master",
+            commit="abc123def456",
+            rayci_schedule="",
+            pull_request="123",
+        )
+        # Even with pull_request set, master branch should return sha tags
+        assert ctx._versions() == ["abc123", "build123"]
+
+
+class TestDestinationTags:
+    """
+    Test destination_tags method.
+
+    Tags are formed as: {version}{variation}{python_suffix}{platform}{architecture_suffix}
+    """
+
+    @mock.patch("ci.ray_ci.automation.push_ray_image.datetime")
+    def test_nightly_cpu_default_python(self, mock_datetime):
+        """Test: nightly.260107.abc123-py310-cpu"""
+        mock_datetime.now.return_value.strftime.return_value = "260107"
+        ctx = make_ctx(branch="master", commit="abc123def456", rayci_schedule="nightly")
+        tags = ctx.destination_tags()
+        # nightly versions x cpu suffixes x python suffixes
+        # ["nightly.260107.abc123", "nightly"] x ["-cpu", ""] x ["-py310", ""]
+        assert "nightly.260107.abc123-py310-cpu" in tags
+        assert "nightly.260107.abc123-cpu" in tags
+        assert "nightly.260107.abc123-py310" in tags
+        assert "nightly.260107.abc123" in tags
+        assert "nightly-py310-cpu" in tags
+        assert "nightly-cpu" in tags
+        assert "nightly-py310" in tags
+        assert "nightly" in tags
+
+    @mock.patch("ci.ray_ci.automation.push_ray_image.datetime")
+    def test_nightly_extra_gpu(self, mock_datetime):
+        """Test: nightly-extra-py310-cu121 and nightly.260107.abc123-extra-py310-gpu"""
+        mock_datetime.now.return_value.strftime.return_value = "260107"
+        ctx = make_ctx(
+            ray_type=RayType.RAY_EXTRA,
+            platform=GPU_PLATFORM,
+            branch="master",
+            commit="abc123def456",
+            rayci_schedule="nightly",
+        )
+        tags = ctx.destination_tags()
+        # Should include -extra variation and -gpu alias
+        assert "nightly.260107.abc123-extra-py310-cu121" in tags
+        assert "nightly.260107.abc123-extra-py310-gpu" in tags
+        assert "nightly-extra-py310-cu121" in tags
+        assert "nightly-extra-py310-gpu" in tags
+        assert "nightly.260107.abc123-extra-cu121" in tags
+        assert "nightly-extra-gpu" in tags
+
+    @mock.patch("ci.ray_ci.automation.push_ray_image.datetime")
+    def test_nightly_gpu_platform_non_default_python(self, mock_datetime):
+        """Test: nightly.260107.abc123-py311-cu121"""
+        mock_datetime.now.return_value.strftime.return_value = "260107"
+        ctx = make_ctx(
+            python_version="3.11",
+            platform=GPU_PLATFORM,
+            branch="master",
+            commit="abc123def456",
+            rayci_schedule="nightly",
+        )
+        tags = ctx.destination_tags()
+        # Should include -cu121, -gpu aliases but NOT empty python suffix (3.11 is not default)
+        assert "nightly.260107.abc123-py311-cu121" in tags
+        assert "nightly.260107.abc123-py311-gpu" in tags
+        assert "nightly-py311-cu121" in tags
+        assert "nightly-py311-gpu" in tags
+        # Should NOT have empty python suffix variants
+        assert "nightly.260107.abc123-cu121" not in tags
+        assert "nightly-gpu" not in tags
+
+    def test_release_gpu(self):
+        """Test: 2.53.0.abc123-py310-cu121"""
+        ctx = make_ctx(
+            platform=GPU_PLATFORM, branch="releases/2.53.0", commit="abc123def456"
+        )
+        tags = ctx.destination_tags()
+        assert "2.53.0.abc123-py310-cu121" in tags
+        assert "2.53.0.abc123-py310-gpu" in tags
+        # Default python suffix variants
+        assert "2.53.0.abc123-cu121" in tags
+        assert "2.53.0.abc123-gpu" in tags
+
+    def test_release_extra_gpu(self):
+        """Test: 2.53.0.abc123-extra-py310-cu121"""
+        ctx = make_ctx(
+            ray_type=RayType.RAY_EXTRA,
+            platform=GPU_PLATFORM,
+            branch="releases/2.53.0",
+            commit="abc123def456",
+        )
+        tags = ctx.destination_tags()
+        assert "2.53.0.abc123-extra-py310-cu121" in tags
+        assert "2.53.0.abc123-extra-py310-gpu" in tags
+        # Default python suffix variants
+        assert "2.53.0.abc123-extra-cu121" in tags
+        assert "2.53.0.abc123-extra-gpu" in tags
+
+    def test_release_non_gpu_platform_cuda(self):
+        """Test release with non-GPU_PLATFORM CUDA version (no -gpu alias)."""
+        ctx = make_ctx(
+            python_version="3.11",
+            platform="cu12.3.2-cudnn9",  # Not GPU_PLATFORM
+            branch="releases/2.44.0",
+            commit="abc123def456",
+        )
+        tags = ctx.destination_tags()
+        assert "2.44.0.abc123-py311-cu123" in tags
+        # Should NOT have -gpu alias since this isn't GPU_PLATFORM
+        assert "2.44.0.abc123-py311-gpu" not in tags
+
+    def test_release_cpu_aarch64(self):
+        """Test release with architecture suffix."""
+        ctx = make_ctx(
+            architecture="aarch64",
+            branch="releases/2.44.0",
+            commit="abc123def456",
+        )
+        tags = ctx.destination_tags()
+        assert "2.44.0.abc123-py310-cpu-aarch64" in tags
+        assert "2.44.0.abc123-cpu-aarch64" in tags
+        # Empty platform suffix variant (ray cpu alias)
+        assert "2.44.0.abc123-py310-aarch64" in tags
+        assert "2.44.0.abc123-aarch64" in tags
+
+    def test_pull_request_tags(self):
+        """Test PR builds include pr-{number} prefix."""
+        ctx = make_ctx(
+            branch="feature-branch", commit="abc123def456", pull_request="12345"
+        )
+        tags = ctx.destination_tags()
+        assert "pr-12345.abc123-py310-cpu" in tags
+        assert "build123-py310-cpu" in tags
+
+    def test_feature_branch_non_pr(self):
+        """Test non-PR feature branch uses sha and build_id."""
+        ctx = make_ctx(
+            branch="feature-branch", python_version="3.12", commit="abc123def456"
+        )
+        tags = ctx.destination_tags()
+        assert "abc123-py312-cpu" in tags
+        assert "build123-py312-cpu" in tags
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-vv", __file__]))