Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions ci/ray_ci/automation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -316,3 +316,29 @@ py_binary(
ci_require("click"),
],
)

py_binary(
name = "push_ray_image",
srcs = ["push_ray_image.py"],
exec_compatible_with = ["//bazel:py3"],
deps = [
":crane_lib",
"//ci/ray_ci:ray_ci_lib",
ci_require("click"),
],
)

py_test(
name = "test_push_ray_image",
size = "small",
srcs = ["test_push_ray_image.py"],
exec_compatible_with = ["//bazel:py3"],
tags = [
"ci_unit",
"team:ci",
],
deps = [
":push_ray_image",
ci_require("pytest"),
],
)
318 changes: 318 additions & 0 deletions ci/ray_ci/automation/push_ray_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
import logging
import sys
from datetime import datetime
from typing import List

import click

from ci.ray_ci.automation.crane_lib import (
call_crane_copy,
call_crane_manifest,
)
from ci.ray_ci.configs import (
ARCHITECTURE,
DEFAULT_ARCHITECTURE,
DEFAULT_PYTHON_TAG_VERSION,
PYTHON_VERSIONS,
)
from ci.ray_ci.docker_container import (
ARCHITECTURES_RAY,
ARCHITECTURES_RAY_LLM,
ARCHITECTURES_RAY_ML,
GPU_PLATFORM,
PLATFORMS_RAY,
PLATFORMS_RAY_LLM,
PLATFORMS_RAY_ML,
PYTHON_VERSIONS_RAY,
PYTHON_VERSIONS_RAY_LLM,
PYTHON_VERSIONS_RAY_ML,
RAY_REPO_MAP,
RayType,
)
from ci.ray_ci.utils import ecr_docker_login

VALID_IMAGE_TYPES = [rt.value for rt in RayType]

logging.basicConfig(
level=logging.INFO,
format="%(message)s",
stream=sys.stdout,
)
logger = logging.getLogger(__name__)


class PushRayImageError(Exception):
"""Error raised when pushing ray images fails."""


def compact_cuda_suffix(platform: str) -> str:
"""Convert a CUDA platform string to compact suffix (e.g. cu12.1.1-cudnn8 -> -cu121)."""
platform_base = platform.split("-", 1)[0]
parts = platform_base.split(".")
if len(parts) < 2:
raise PushRayImageError(f"Unrecognized GPU platform format: {platform}")

return f"-{parts[0]}{parts[1]}"


class RayImagePushContext:
"""Context for publishing a ray image from Wanda cache to Docker Hub."""

ray_type: RayType
python_version: str
platform: str
architecture: str
branch: str
commit: str
rayci_schedule: str
rayci_build_id: str
pull_request: str # buildkite uses "false" or number string
# Computed fields (set in __init__)
arch_suffix: str
wanda_tag: str
docker_hub_repo: str

def __init__(
self,
ray_type: RayType,
python_version: str,
platform: str,
architecture: str,
branch: str,
commit: str,
rayci_schedule: str,
rayci_build_id: str,
pull_request: str,
) -> None:
self.ray_type = ray_type
self.python_version = python_version
self.platform = platform
self.architecture = architecture
self.branch = branch
self.commit = commit
self.rayci_schedule = rayci_schedule
self.rayci_build_id = rayci_build_id
self.pull_request = pull_request

arch_suffix = "" if architecture == DEFAULT_ARCHITECTURE else f"-{architecture}"
self.arch_suffix = arch_suffix
self.wanda_tag = f"{rayci_build_id}-{self.wanda_image_name()}"
self.docker_hub_repo = f"rayproject/{RAY_REPO_MAP[self.ray_type.value]}"

def assert_published_image_type(self) -> None:
invalid_python_version = (
f"Invalid python version {self.python_version} for {self.ray_type}"
)
invalid_platform = f"Invalid platform {self.platform} for {self.ray_type}"
invalid_architecture = (
f"Invalid architecture {self.architecture} for {self.ray_type}"
)

if self.ray_type in [RayType.RAY_ML, RayType.RAY_ML_EXTRA]:
if self.python_version not in PYTHON_VERSIONS_RAY_ML:
raise PushRayImageError(invalid_python_version)
if self.platform not in PLATFORMS_RAY_ML:
raise PushRayImageError(invalid_platform)
if self.architecture not in ARCHITECTURES_RAY_ML:
raise PushRayImageError(invalid_architecture)
elif self.ray_type in [RayType.RAY_LLM, RayType.RAY_LLM_EXTRA]:
if self.python_version not in PYTHON_VERSIONS_RAY_LLM:
raise PushRayImageError(invalid_python_version)
if self.platform not in PLATFORMS_RAY_LLM:
raise PushRayImageError(invalid_platform)
if self.architecture not in ARCHITECTURES_RAY_LLM:
raise PushRayImageError(invalid_architecture)
else:
# ray or ray-extra
if self.python_version not in PYTHON_VERSIONS_RAY:
raise PushRayImageError(invalid_python_version)
if self.platform not in PLATFORMS_RAY:
raise PushRayImageError(invalid_platform)
if self.architecture not in ARCHITECTURES_RAY:
raise PushRayImageError(invalid_architecture)

def destination_tags(self) -> List[str]:
"""
Compute the destination tags for this context.

Tags are formed as:
{version}{variation}{python_suffix}{platform}{architecture_suffix}

For example:
- nightly.260107.abc123-py310-cpu
- nightly-extra-py310-cu121
- nightly.260107.abc123-extra-py310-gpu
- 2.53.0.abc123-py310-cu121
- 2.53.0.abc123-extra-py310-cu121
"""
tags = []
for version in self._versions():
for plat in self._platform_suffixes():
for py in self._python_suffixes():
tags.append(
f"{version}{self._variation_suffix()}{py}{plat}{self.arch_suffix}"
)
return tags

def wanda_image_name(self) -> str:
"""Get the wanda source image name for this context."""
if self.platform == "cpu":
return (
f"{self.ray_type.value}-py{self.python_version}-cpu{self.arch_suffix}"
)
return f"{self.ray_type.value}-py{self.python_version}-{self.platform}{self.arch_suffix}"

def _versions(self) -> List[str]:
"""Compute version tags based on branch/schedule/PR status."""
is_master = self.branch == "master"
is_nightly = self.rayci_schedule == "nightly"
is_pull_request = self.pull_request != "false"
is_release = self.branch and self.branch.startswith("releases/")
sha_tag = self.commit[:6]
formatted_date = datetime.now().strftime("%y%m%d")

if is_master:
if is_nightly:
return [f"nightly.{formatted_date}.{sha_tag}", "nightly"]
return [sha_tag, self.rayci_build_id]
elif is_release:
release_name = self.branch[len("releases/") :]
return [f"{release_name}.{sha_tag}"]
elif is_pull_request:
return [f"pr-{self.pull_request}.{sha_tag}", self.rayci_build_id]
else:
return [sha_tag, self.rayci_build_id]

def _variation_suffix(self) -> str:
"""Get -extra suffix for extra image types."""
if self.ray_type in {
RayType.RAY_EXTRA,
RayType.RAY_ML_EXTRA,
RayType.RAY_LLM_EXTRA,
}:
return "-extra"
return ""

def _python_suffixes(self) -> List[str]:
"""Get python version suffixes (includes empty for default version)."""
suffixes = [f"-py{self.python_version.replace('.', '')}"]
if self.python_version == DEFAULT_PYTHON_TAG_VERSION:
suffixes.append("")
return suffixes

def _platform_suffixes(self) -> List[str]:
"""Get platform suffixes (includes aliases like -gpu for GPU_PLATFORM)."""
if self.platform == "cpu":
suffixes = ["-cpu"]
# no tag is alias to cpu for ray image
if self.ray_type in {RayType.RAY, RayType.RAY_EXTRA}:
suffixes.append("")
return suffixes

suffixes = [compact_cuda_suffix(self.platform)]
if self.platform == GPU_PLATFORM:
# gpu is alias to GPU_PLATFORM value for ray image
suffixes.append("-gpu")
# no tag is alias to gpu for ray-ml image
if self.ray_type in {RayType.RAY_ML, RayType.RAY_ML_EXTRA}:
suffixes.append("")

return suffixes


def _image_exists(tag: str) -> bool:
"""Check if a container image manifest exists using crane."""
return_code, _ = call_crane_manifest(tag)
return return_code == 0


def _copy_image(reference: str, destination: str, dry_run: bool = False) -> None:
"""Copy a container image from source to destination using crane."""
if dry_run:
logger.info(f"DRY RUN: Would copy {reference} -> {destination}")
return

logger.info(f"Copying {reference} -> {destination}")
return_code, output = call_crane_copy(reference, destination)
if return_code != 0:
raise PushRayImageError(f"Crane copy failed: {output}")
logger.info(f"Successfully copied to {destination}")


@click.command()
@click.option(
"--python-version", type=click.Choice(list(PYTHON_VERSIONS.keys())), required=True
)
@click.option("--platform", type=click.Choice(list(PLATFORMS_RAY)), required=True)
@click.option(
"--image-type",
type=click.Choice(VALID_IMAGE_TYPES),
required=True,
)
@click.option("--architecture", type=click.Choice(ARCHITECTURE), required=True)
@click.option("--rayci-work-repo", type=str, required=True, envvar="RAYCI_WORK_REPO")
@click.option("--rayci-build-id", type=str, required=True, envvar="RAYCI_BUILD_ID")
@click.option("--branch", type=str, required=True, envvar="BUILDKITE_BRANCH")
@click.option("--commit", type=str, required=True, envvar="BUILDKITE_COMMIT")
@click.option("--rayci-schedule", type=str, default="", envvar="RAYCI_SCHEDULE")
@click.option(
"--pull-request", type=str, default="false", envvar="BUILDKITE_PULL_REQUEST"
)
@click.option("--upload", is_flag=True, default=False)
def main(
python_version: str,
platform: str,
image_type: str,
architecture: str,
rayci_work_repo: str,
rayci_build_id: str,
branch: str,
commit: str,
rayci_schedule: str,
pull_request: str,
upload: bool,
) -> None:
"""
Publish a Wanda-cached ray image to Docker Hub.

Tags are generated matching the original RayDockerContainer format:
{version}{variation}{python_suffix}{platform}{architecture_suffix}
"""
dry_run = not upload
if dry_run:
logger.info("DRY RUN MODE - no images will be pushed")

ctx = RayImagePushContext(
ray_type=RayType(image_type),
python_version=python_version,
platform=platform,
architecture=architecture,
branch=branch,
commit=commit,
rayci_schedule=rayci_schedule,
rayci_build_id=rayci_build_id,
pull_request=pull_request,
)

ctx.assert_published_image_type()

ecr_registry = rayci_work_repo.split("/")[0]
ecr_docker_login(ecr_registry)

src_ref = f"{rayci_work_repo}:{ctx.wanda_tag}"
logger.info(f"Verifying source image in Wanda cache: {src_ref}")
if not _image_exists(src_ref):
raise PushRayImageError(f"Source image not found in Wanda cache: {src_ref}")

destination_tags = ctx.destination_tags()
for tag in destination_tags:
dest_ref = f"{ctx.docker_hub_repo}:{tag}"
_copy_image(src_ref, dest_ref, dry_run=dry_run)

logger.info(
f"Successfully pushed {ctx.ray_type.value} image with tags: {destination_tags}"
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Misleading "Successfully pushed" log in dry run mode

Low Severity

The final log message states "Successfully pushed" even when dry_run=True (i.e., upload=False). In dry run mode, _copy_image only logs what would happen and returns without actually copying any images. The summary message at line 312-314 doesn't account for the dry run state, giving users incorrect feedback that images were pushed when they weren't.

Fix in Cursor Fix in Web



if __name__ == "__main__":
main()
Loading