From 92079a658845ffb32dc928406f73632032191af9 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Wed, 21 Jan 2026 06:46:53 +0000 Subject: [PATCH 01/25] init refactor --- rock/admin/entrypoints/sandbox_api.py | 48 +--- rock/sandbox/base_manager.py | 1 + rock/sandbox/sandbox_manager.py | 272 ++++-------------- rock/sandbox/service/deployment_service.py | 177 ++++++++++++ tests/unit/conftest.py | 5 + .../service/test_deployment_service.py | 10 + tests/unit/sandbox/test_sandbox_manager.py | 36 +-- 7 files changed, 261 insertions(+), 288 deletions(-) create mode 100644 rock/sandbox/service/deployment_service.py create mode 100644 tests/unit/sandbox/service/test_deployment_service.py diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index e64fb7d22..b373e6210 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -52,7 +52,7 @@ async def start_async( x_experiment_id: str | None = Header(default="default", alias="X-Experiment-Id"), rock_authorization: str | None = Header(default="default", alias="X-Key"), ) -> RockResponse[SandboxStartResponse]: - sandbox_start_response = await sandbox_manager.start_async( + sandbox_start_response = await sandbox_manager.submit( DockerDeploymentConfig.from_request(request), user_info={ "user_id": x_user_id, @@ -91,52 +91,6 @@ async def get_status(sandbox_id: str): return RockResponse(result=await sandbox_manager.get_status(sandbox_id)) -@sandbox_router.post("/execute") -@handle_exceptions(error_message="execute command failed") -async def execute(command: SandboxCommand) -> RockResponse[CommandResponse]: - return RockResponse(result=await sandbox_manager.execute(command)) - - -@sandbox_router.post("/create_session") -@handle_exceptions(error_message="create session failed") -async def create_session(request: SandboxCreateBashSessionRequest) -> RockResponse[CreateBashSessionResponse]: - return RockResponse(result=await sandbox_manager.create_session(request)) - - -@sandbox_router.post("/run_in_session") -@handle_exceptions(error_message="run in session failed") -async def run(action: SandboxBashAction) -> RockResponse[BashObservation]: - return RockResponse(result=await sandbox_manager.run_in_session(action)) - - -@sandbox_router.post("/close_session") -@handle_exceptions(error_message="close session failed") -async def close_session(request: SandboxCloseBashSessionRequest) -> RockResponse[CloseBashSessionResponse]: - return RockResponse(result=await sandbox_manager.close_session(request)) - - -@sandbox_router.post("/read_file") -@handle_exceptions(error_message="read file failed") -async def read_file(request: SandboxReadFileRequest) -> RockResponse[ReadFileResponse]: - return RockResponse(result=await sandbox_manager.read_file(request)) - - -@sandbox_router.post("/write_file") -@handle_exceptions(error_message="write file failed") -async def write_file(request: SandboxWriteFileRequest) -> RockResponse[WriteFileResponse]: - return RockResponse(result=await sandbox_manager.write_file(request)) - - -@sandbox_router.post("/upload") -@handle_exceptions(error_message="upload file failed") -async def upload( - file: UploadFile = File(...), - target_path: str = Form(...), - sandbox_id: str | None = Form(None), -) -> RockResponse[UploadResponse]: - return RockResponse(result=await sandbox_manager.upload(file, target_path, sandbox_id)) - - @sandbox_router.post("/stop") @handle_exceptions(error_message="stop sandbox failed") async def close(sandbox_id: str = Body(..., embed=True)) -> RockResponse[str]: diff --git a/rock/sandbox/base_manager.py b/rock/sandbox/base_manager.py index 45ce81e76..497335719 100644 --- a/rock/sandbox/base_manager.py +++ b/rock/sandbox/base_manager.py @@ -116,6 +116,7 @@ async def _report_system_resource_metrics(self): self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_CPU_RESOURCE, available_cpu) self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_MEM_RESOURCE, available_mem) + # TODO: remove ray dependency in base manager, impl it in ray deployment service async def _collect_system_resource_metrics(self): """收集系统资源指标""" cluster_resources = ray.cluster_resources() diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 52bed6a47..bc418641f 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -1,41 +1,26 @@ import asyncio -import json import time -import ray -from fastapi import UploadFile - from rock import env_vars from rock.actions import ( - BashObservation, - CloseBashSessionResponse, CommandResponse, - CreateBashSessionResponse, - ReadFileResponse, - UploadResponse, - WriteFileResponse, ) from rock.actions.sandbox.response import IsAliveResponse, State + from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService from rock.admin.core.redis_key import ALIVE_PREFIX, alive_sandbox_key, timeout_sandbox_key from rock.admin.metrics.decorator import monitor_sandbox_operation -from rock.admin.proto.request import SandboxAction as Action -from rock.admin.proto.request import SandboxCloseBashSessionRequest as CloseBashSessionRequest -from rock.admin.proto.request import SandboxCommand as Command -from rock.admin.proto.request import SandboxCreateSessionRequest as CreateSessionRequest -from rock.admin.proto.request import SandboxReadFileRequest as ReadFileRequest -from rock.admin.proto.request import SandboxWriteFileRequest as WriteFileRequest from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse from rock.config import RockConfig, RuntimeConfig from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig from rock.deployments.constants import Port from rock.deployments.status import PersistedServiceStatus, ServiceStatus +from rock.deployments.abstract import AbstractDeployment +from rock.deployments.config import DeploymentConfig from rock.logger import init_logger -from rock.rocklet import __version__ as swe_version -from rock.sandbox import __version__ as gateway_version from rock.sandbox.base_manager import BaseManager -from rock.sandbox.sandbox_actor import SandboxActor +from rock.sandbox.service.deployment_service import AbstractDeploymentService, RayDeploymentService from rock.sdk.common.exceptions import BadRequestRockError from rock.utils import ( EAGLE_EYE_TRACE_ID, @@ -51,6 +36,7 @@ class SandboxManager(BaseManager): _ray_namespace: str = None + _deployment_service: AbstractDeploymentService = None def __init__( self, @@ -65,125 +51,48 @@ def __init__( ) self._ray_service = ray_service self._ray_namespace = ray_namespace + self._deployment_service = RayDeploymentService(ray_namespace=ray_namespace) logger.info("sandbox service init success") - async def async_ray_get(self, ray_future: ray.ObjectRef): - self._ray_service.increment_ray_request_count() - loop = asyncio.get_running_loop() - try: - result = await loop.run_in_executor(self._executor, lambda r: ray.get(r, timeout=60), ray_future) - except Exception as e: - logger.error("ray get failed", exc_info=e) - error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get failed, {str(e)}" - raise Exception(error_msg) - return result - - async def async_ray_get_actor(self, sandbox_id: str): - self._ray_service.increment_ray_request_count() - loop = asyncio.get_running_loop() - try: - result = await loop.run_in_executor( - self._executor, ray.get_actor, self.deployment_manager.get_actor_name(sandbox_id), self._ray_namespace - ) - except ValueError as e: - logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) - raise e - except Exception as e: - logger.error("ray get actor failed", exc_info=e) - error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get actor failed, {str(e)}" - raise Exception(error_msg) - return result - - async def _check_sandbox_exists_in_redis(self, config: DeploymentConfig): - if isinstance(config, DockerDeploymentConfig) and config.container_name: - sandbox_id = config.container_name - if self._redis_provider and await self._redis_provider.json_get(alive_sandbox_key(sandbox_id), "$"): - raise BadRequestRockError(f"Sandbox {sandbox_id} already exists") - @monitor_sandbox_operation() async def start_async(self, config: DeploymentConfig, user_info: dict = {}) -> SandboxStartResponse: - async with self._ray_service.get_ray_rwlock().read_lock(): - await self._check_sandbox_exists_in_redis(config) - docker_deployment_config: DockerDeploymentConfig = await self.deployment_manager.init_config(config) - sandbox_id = docker_deployment_config.container_name - logger.info(f"[{sandbox_id}] start_async params:{json.dumps(docker_deployment_config.model_dump(), indent=2)}") - actor_name = self.deployment_manager.get_actor_name(sandbox_id) - - deployment = docker_deployment_config.get_deployment() + return await self.submit(config, user_info) + @monitor_sandbox_operation() + async def submit(self, config: DeploymentConfig, user_info: dict = {}): + async with self._ray_service.get_ray_rwlock().read_lock(): + deployment_config: DeploymentConfig = await self.deployment_manager.init_config(config) + sandbox_id = deployment_config.container_name + # deployment: AbstractDeployment = deployment_config.get_deployment() self.validate_sandbox_spec(self.rock_config.runtime, config) - sandbox_actor: SandboxActor = await deployment.creator_actor(actor_name) - user_id = user_info.get("user_id", "default") - experiment_id = user_info.get("experiment_id", "default") - namespace = user_info.get("namespace", "default") - rock_authorization = user_info.get("rock_authorization", "default") - sandbox_actor.start.remote() - sandbox_actor.set_user_id.remote(user_id) - sandbox_actor.set_experiment_id.remote(experiment_id) - sandbox_actor.set_namespace.remote(namespace) - - self._sandbox_meta[sandbox_id] = {"image": docker_deployment_config.image} + self._sandbox_meta[sandbox_id] = {"image": deployment_config.image} + sandbox_info: SandboxInfo = await self._deployment_service.submit(deployment_config, user_info) logger.info(f"sandbox {sandbox_id} is submitted") - stop_time = str(int(time.time()) + docker_deployment_config.auto_clear_time * 60) + + stop_time = str(int(time.time()) + deployment_config.auto_clear_time * 60) auto_clear_time_dict = { - env_vars.ROCK_SANDBOX_AUTO_CLEAR_TIME_KEY: str(docker_deployment_config.auto_clear_time), + env_vars.ROCK_SANDBOX_AUTO_CLEAR_TIME_KEY: str(deployment_config.auto_clear_time), env_vars.ROCK_SANDBOX_EXPIRE_TIME_KEY: stop_time, } - sandbox_info: SandboxInfo = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) - sandbox_info["user_id"] = user_id - sandbox_info["experiment_id"] = experiment_id - sandbox_info["namespace"] = namespace - sandbox_info["state"] = State.PENDING - sandbox_info["rock_authorization"] = rock_authorization if self._redis_provider: await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) await self._redis_provider.json_set(timeout_sandbox_key(sandbox_id), "$", auto_clear_time_dict) + return SandboxStartResponse( sandbox_id=sandbox_id, host_name=sandbox_info.get("host_name"), host_ip=sandbox_info.get("host_ip"), ) - @monitor_sandbox_operation() - async def start(self, config: DeploymentConfig) -> SandboxStartResponse: - docker_deployment_config: DockerDeploymentConfig = await self.deployment_manager.init_config(config) - - sandbox_id = docker_deployment_config.container_name - actor_name = self.deployment_manager.get_actor_name(sandbox_id) - deployment = docker_deployment_config.get_deployment() - - sandbox_actor: SandboxActor = await deployment.creator_actor(actor_name) - - await self.async_ray_get(sandbox_actor.start.remote()) - logger.info(f"sandbox {sandbox_id} is started") - - while not await self._is_actor_alive(sandbox_id): - logger.debug(f"wait actor for sandbox alive, sandbox_id: {sandbox_id}") - # TODO: timeout check - await asyncio.sleep(1) - await self.get_status(sandbox_id) - - self._sandbox_meta[sandbox_id] = {"image": docker_deployment_config.image} - - return SandboxStartResponse( - sandbox_id=sandbox_id, - host_name=await self.async_ray_get(sandbox_actor.host_name.remote()), - host_ip=await self.async_ray_get(sandbox_actor.host_ip.remote()), - ) - @monitor_sandbox_operation() async def stop(self, sandbox_id): async with self._ray_service.get_ray_rwlock().read_lock(): logger.info(f"stop sandbox {sandbox_id}") try: - sandbox_actor = await self.async_ray_get_actor(sandbox_id) + await self._deployment_service.stop(sandbox_id) except ValueError as e: + logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) await self._clear_redis_keys(sandbox_id) - raise Exception(f"sandbox {sandbox_id} not found to stop, {str(e)}") - logger.info(f"start to stop run time {sandbox_id}") - await self.async_ray_get(sandbox_actor.stop.remote()) - logger.info(f"run time stop over {sandbox_id}") - ray.kill(sandbox_actor) try: self._sandbox_meta.pop(sandbox_id) except KeyError: @@ -193,24 +102,18 @@ async def stop(self, sandbox_id): async def get_mount(self, sandbox_id): async with self._ray_service.get_ray_rwlock().read_lock(): - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - await self._clear_redis_keys(sandbox_id) - raise Exception(f"sandbox {sandbox_id} not found to get mount") - result = await self.async_ray_get(sandbox_actor.get_mount.remote()) - logger.info(f"get_mount: {result}") - return result + return self._deployment_service.get_mount(sandbox_id) @monitor_sandbox_operation() async def commit(self, sandbox_id, image_tag: str, username: str, password: str) -> CommandResponse: async with self._ray_service.get_ray_rwlock().read_lock(): logger.info(f"commit sandbox {sandbox_id}") - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: + deployment: AbstractDeployment = await self._deployment_service.get_deployment(sandbox_id) + if deployment is None: await self._clear_redis_keys(sandbox_id) raise Exception(f"sandbox {sandbox_id} not found to commit") logger.info(f"begin to commit {sandbox_id} to {image_tag}") - result = await self.async_ray_get(sandbox_actor.commit.remote(image_tag, username, password)) + result = await deployment.commit(image_tag, username, password) logger.info(f"commit {sandbox_id} to {image_tag} finished, result {result}") return result @@ -223,44 +126,13 @@ async def _clear_redis_keys(self, sandbox_id): @monitor_sandbox_operation() async def get_status(self, sandbox_id) -> SandboxStatusResponse: async with self._ray_service.get_ray_rwlock().read_lock(): - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to get status") - else: - remote_status: ServiceStatus = await self.async_ray_get(sandbox_actor.get_status.remote()) - alive = await self.async_ray_get(sandbox_actor.is_alive.remote()) - sandbox_info: SandboxInfo = None - if self._redis_provider: - sandbox_info = await build_sandbox_from_redis(self._redis_provider, sandbox_id) - if sandbox_info is None: - # The start() method will write to redis on the first call to get_status() - sandbox_info = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) - sandbox_info.update(remote_status.to_dict()) - if alive.is_alive: - sandbox_info["state"] = State.RUNNING - await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) - await self._update_expire_time(sandbox_id) - logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") - else: - sandbox_info = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) - - return SandboxStatusResponse( - sandbox_id=sandbox_id, - status=remote_status.phases, - state=sandbox_info.get("state"), - port_mapping=remote_status.get_port_mapping(), - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=alive.is_alive, - image=sandbox_info.get("image"), - swe_rex_version=swe_version, - gateway_version=gateway_version, - user_id=sandbox_info.get("user_id"), - experiment_id=sandbox_info.get("experiment_id"), - namespace=sandbox_info.get("namespace"), - cpus=sandbox_info.get("cpus"), - memory=sandbox_info.get("memory"), - ) + response: SandboxStatusResponse = await self._deployment_service.get_status(sandbox_id) + sandbox_info: SandboxInfo = self.get_info_from_response(response) + if self._redis_provider: + await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) + await self._update_expire_time(sandbox_id) + # logger.info(f"sandbox {sandbox_id} status is {remote_status}, write to redis") + return response async def _get_sandbox_info(self, sandbox_id: str) -> SandboxInfo: """Get sandbox info, prioritize Redis, fallback to Ray Actor""" @@ -363,59 +235,20 @@ async def get_remote_status(self, sandbox_id: str, host_ip: str) -> ServiceStatu error_msg = ( f"get_remote_status failed! {response.get('failure_reason') if response.get('failure_reason') else ''}" ) - raise Exception(error_msg) - - async def create_session(self, request: CreateSessionRequest) -> CreateBashSessionResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to create session") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.create_session.remote(request)) - - @monitor_sandbox_operation() - async def run_in_session(self, action: Action) -> BashObservation: - sandbox_actor = await self.async_ray_get_actor(action.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {action.sandbox_id} not found to run in session") - await self._update_expire_time(action.sandbox_id) - return await self.async_ray_get(sandbox_actor.run_in_session.remote(action)) - - async def close_session(self, request: CloseBashSessionRequest) -> CloseBashSessionResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to close session") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.close_session.remote(request)) - - async def execute(self, command: Command) -> CommandResponse: - sandbox_actor = await self.async_ray_get_actor(command.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {command.sandbox_id} not found to execute") - await self._update_expire_time(command.sandbox_id) - return await self.async_ray_get(sandbox_actor.execute.remote(command)) - - async def read_file(self, request: ReadFileRequest) -> ReadFileResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to read file") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.read_file.remote(request)) - - @monitor_sandbox_operation() - async def write_file(self, request: WriteFileRequest) -> WriteFileResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to write file") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.write_file.remote(request)) - - @monitor_sandbox_operation() - async def upload(self, file: UploadFile, target_path: str, sandbox_id: str) -> UploadResponse: - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to upload file") - await self._update_expire_time(sandbox_id) - return await self.async_ray_get(sandbox_actor.upload.remote(file, target_path)) + raise Exception(error_msg) + + def get_info_from_response(self, response: SandboxStatusResponse) -> SandboxInfo: + return SandboxInfo( + host_name=response.host_name, + host_ip=response.host_ip, + user_id=response.user_id, + experiment_id=response.experiment_id, + namespace=response.namespace, + sandbox_id=response.sandbox_id, + cpus=response.cpus, + memory=response.memory, + port_mapping=response.port_mapping, + ) async def _is_expired(self, sandbox_id): timeout_dict = await self._redis_provider.json_get(timeout_sandbox_key(sandbox_id), "$") @@ -429,12 +262,12 @@ async def _is_expired(self, sandbox_id): logger.info(f"sandbox_id:[{sandbox_id}] is already cleared") return True - async def _is_actor_alive(self, sandbox_id): + async def _is_deployment_alive(self, sandbox_id): try: - actor = await self.async_ray_get_actor(sandbox_id) - return actor is not None + deployment = await self._deployment_service.get_deployment(sandbox_id) + return deployment is not None except Exception as e: - logger.error("get actor failed", exc_info=e) + logger.error("get deployment failed", exc_info=e) return False async def _check_job_background(self): @@ -456,8 +289,7 @@ async def _check_job_background(self): continue async def get_sandbox_statistics(self, sandbox_id): - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - resource_metrics = await self.async_ray_get(sandbox_actor.get_sandbox_statistics.remote()) + resource_metrics = await self._deployment_service.get_sandbox_statistics(sandbox_id) return resource_metrics async def _update_expire_time(self, sandbox_id): @@ -494,4 +326,4 @@ def validate_sandbox_spec(self, runtime_config: RuntimeConfig, deployment_config ) except ValueError as e: logger.warning(f"Invalid memory size: {deployment_config.memory}", exc_info=e) - raise BadRequestRockError(f"Invalid memory size: {self._config.memory}") + raise BadRequestRockError(f"Invalid memory size: {deployment_config.memory}") diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py new file mode 100644 index 000000000..8726812f6 --- /dev/null +++ b/rock/sandbox/service/deployment_service.py @@ -0,0 +1,177 @@ +from abc import abstractmethod +import asyncio +from rock.actions.sandbox.response import CommandResponse, State +from rock.actions.sandbox.sandbox_info import SandboxInfo +from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse +from rock.deployments.abstract import AbstractDeployment +import ray +from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig +from rock.deployments.constants import Status +from rock.deployments.docker import DockerDeployment +from rock.deployments.ray import RayDeployment +from rock.deployments.status import ServiceStatus +from rock.logger import init_logger +from rock.sandbox.sandbox_actor import SandboxActor +from rock.sdk.common.exceptions import BadRequestRockError +from rock.utils.format import parse_memory_size +from rock.rocklet import __version__ as swe_version +from rock.sandbox import __version__ as gateway_version + +logger = init_logger(__name__) + + +class AbstractDeploymentService(): + @abstractmethod + async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: + ... + + @abstractmethod + async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxStartResponse: + """Get status of sandbox.""" + ... + + @abstractmethod + async def get_status(self, *args, **kwargs) -> SandboxStatusResponse: + """Get status of sandbox.""" + ... + + @abstractmethod + async def stop(self, *args, **kwargs): + """Stop sandbox.""" + + @abstractmethod + async def get_mount(self, *args, **kwargs): + """Get mount of sandbox.""" + ... + + @abstractmethod + async def get_sandbox_statistics(self, *args, **kwargs): + """Get sandbox statistics.""" + ... + + @abstractmethod + async def commit(self, *args, **kwargs) -> CommandResponse: + ... + + +class RayDeploymentService(): + def __init__(self, ray_namespace: str): + self._ray_namespace = ray_namespace + + def _get_actor_name(self, sandbox_id): + return f"sandbox-{sandbox_id}" + + async def async_ray_get_actor(self, sandbox_id: str): + """Async wrapper for ray.get_actor() using asyncio.to_thread for non-blocking execution.""" + try: + actor_name = self._get_actor_name(sandbox_id) + result = await asyncio.to_thread(ray.get_actor, actor_name, namespace=self._ray_namespace) + except ValueError as e: + logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) + raise e + except Exception as e: + logger.error("ray get actor failed", exc_info=e) + error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get actor failed, {str(e)}" + raise Exception(error_msg) + return result + + async def async_ray_get(self, ray_future: ray.ObjectRef): + """Async wrapper for ray.get() using asyncio.to_thread for non-blocking execution.""" + try: + # Use asyncio.to_thread to run ray.get in a thread pool without managing executor + result = await asyncio.to_thread(ray.get, ray_future, timeout=60) + except Exception as e: + logger.error("ray get failed", exc_info=e) + error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get failed, {str(e)}" + raise Exception(error_msg) + return result + + async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> SandboxInfo: + sandbox_actor: SandboxActor = await self.creator_actor(config) + user_id = user_info.get("user_id", "default") + experiment_id = user_info.get("experiment_id", "default") + namespace = user_info.get("namespace", "default") + sandbox_actor.start.remote() + sandbox_actor.set_user_id.remote(user_id) + sandbox_actor.set_experiment_id.remote(experiment_id) + sandbox_actor.set_namespace.remote(namespace) + sandbox_info: SandboxInfo = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) + sandbox_info["user_id"] = user_id + sandbox_info["experiment_id"] = experiment_id + sandbox_info["namespace"] = namespace + sandbox_info["state"] = State.PENDING + return sandbox_info + + async def creator_actor(self, config: DockerDeploymentConfig): + actor_options = self._generate_actor_options(config) + deployment: DockerDeployment = config.get_deployment() + sandbox_actor = SandboxActor.options(**actor_options).remote(config, deployment) + return sandbox_actor + + def _generate_actor_options(self, config: DockerDeploymentConfig) -> dict: + actor_name = self._get_actor_name(config.container_name) + actor_options = {"name": actor_name, "lifetime": "detached"} + try: + memory = parse_memory_size(config.memory) + actor_options["num_cpus"] = config.cpus + actor_options["memory"] = memory + return actor_options + except ValueError as e: + logger.warning(f"Invalid memory size: {config.memory}", exc_info=e) + raise BadRequestRockError(f"Invalid memory size: {config.memory}") + + async def stop(self, sandbox_id: str): + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + await self.async_ray_get(actor.stop.remote()) + logger.info(f"run time stop over {sandbox_id}") + ray.kill(actor) + + async def get_status(self, sandbox_id: str) -> SandboxStatusResponse: + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) + remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) + alive = await self.async_ray_get(actor.is_alive.remote()) + if alive.is_alive: + sandbox_info["state"] = State.RUNNING + return SandboxStatusResponse( + sandbox_id=sandbox_id, + status=remote_status.phases, + port_mapping=remote_status.get_port_mapping(), + host_name=sandbox_info.get("host_name"), + host_ip=sandbox_info.get("host_ip"), + is_alive=alive.is_alive, + image=sandbox_info.get("image"), + swe_rex_version=swe_version, + gateway_version=gateway_version, + user_id=sandbox_info.get("user_id"), + experiment_id=sandbox_info.get("experiment_id"), + namespace=sandbox_info.get("namespace"), + cpus=sandbox_info.get("cpus"), + memory=sandbox_info.get("memory"), + state=sandbox_info.get("state"), + ) + + async def get_mount(self, sandbox_id: str): + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.get_mount.remote()) + logger.info(f"get_mount: {result}") + return result + + async def get_sandbox_statistics(self, sandbox_id: str): + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.get_sandbox_statistics.remote()) + logger.info(f"get_sandbox_statistics: {result}") + return result + + async def commit(self, *args, **kwargs) -> CommandResponse: + actor = await self._ray_actor + result = await self.async_ray_get(actor.commit.remote(*args, **kwargs)) + logger.info(f"commit: {result}") + return result + + # TODO: considering modify the result to deployment inside sandbox actor + async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) + logger.info(f"get_deployment: {status}") + return status.phases["docker_run"] == Status.RUNNING diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b43e75283..bae79fa95 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -11,6 +11,7 @@ from rock.deployments.config import DockerDeploymentConfig from rock.logger import init_logger from rock.sandbox.sandbox_manager import SandboxManager +from rock.sandbox.service.deployment_service import RayDeploymentService from rock.sandbox.service.sandbox_proxy_service import SandboxProxyService from rock.utils.providers.redis_provider import RedisProvider from rock.admin.core.ray_service import RayService @@ -53,6 +54,10 @@ async def sandbox_manager(rock_config: RockConfig, redis_provider: RedisProvider ) return sandbox_manager +@pytest.fixture +async def ray_deployment_service(rock_config: RockConfig, ray_init_shutdown): + ray_deployment_service = RayDeploymentService(ray_namespace=rock_config.ray.namespace) + return ray_deployment_service @pytest.fixture async def sandbox_proxy_service(rock_config: RockConfig, redis_provider: RedisProvider): diff --git a/tests/unit/sandbox/service/test_deployment_service.py b/tests/unit/sandbox/service/test_deployment_service.py new file mode 100644 index 000000000..1217e794c --- /dev/null +++ b/tests/unit/sandbox/service/test_deployment_service.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_get_actor_not_exist_raises_value_error(ray_deployment_service): + sandbox_id = "unknown" + with pytest.raises(Exception) as exc_info: + await ray_deployment_service.async_ray_get_actor(sandbox_id) + assert exc_info.type == ValueError \ No newline at end of file diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 000241fff..5826a9a56 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -26,14 +26,15 @@ async def test_async_sandbox_start(sandbox_manager: SandboxManager): assert sandbox_id is not None search_start_time = time.time() while time.time() - search_start_time < 60: - is_alive_response = await sandbox_manager._is_actor_alive(sandbox_id) + is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) if is_alive_response: break - is_alive_response = await sandbox_manager._is_actor_alive(sandbox_id) + is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) assert is_alive_response - sandbox_actor = await sandbox_manager.async_ray_get_actor(sandbox_id) + # TODO: fix async_ray_get_actor for it is not a general method + sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(sandbox_id) assert sandbox_actor is not None assert await sandbox_actor.user_id.remote() == "default" assert await sandbox_actor.experiment_id.remote() == "default" @@ -71,12 +72,12 @@ async def test_ray_actor_is_alive(sandbox_manager): response = await sandbox_manager.start_async(docker_deploy_config) assert response.sandbox_id is not None - assert await sandbox_manager._is_actor_alive(response.sandbox_id) + assert await sandbox_manager._is_deployment_alive(response.sandbox_id) - sandbox_actor = await sandbox_manager.async_ray_get_actor(response.sandbox_id) + sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(response.sandbox_id) ray.kill(sandbox_actor) - assert not await sandbox_manager._is_actor_alive(response.sandbox_id) + assert not await sandbox_manager._is_deployment_alive(response.sandbox_id) @pytest.mark.need_ray @@ -88,7 +89,7 @@ async def test_user_info_set_success(sandbox_manager): cnt = 0 while True: - is_alive_response = await sandbox_manager._is_actor_alive(sandbox_id) + is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) if is_alive_response: break time.sleep(1) @@ -96,13 +97,15 @@ async def test_user_info_set_success(sandbox_manager): if cnt > 60: raise Exception("sandbox not alive") - is_alive_response = await sandbox_manager._is_actor_alive(sandbox_id) + is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) assert is_alive_response - sandbox_actor = await sandbox_manager.async_ray_get_actor(sandbox_id) - assert sandbox_actor is not None - assert await sandbox_actor.user_id.remote() == "test_user_id" - assert await sandbox_actor.experiment_id.remote() == "test_experiment_id" + sandbox_deployment = await sandbox_manager._deployment_service.get_deployment(sandbox_id) + assert sandbox_deployment is not None + + ray_actor = await sandbox_manager._deployment_service.async_ray_get_actor(sandbox_id) + assert await ray_actor.user_id.remote() == "test_user_id" + assert await ray_actor.experiment_id.remote() == "test_experiment_id" await sandbox_manager.stop(sandbox_id) @@ -171,12 +174,3 @@ async def test_sandbox_start_with_sandbox_id(sandbox_manager): logger.error(f"test_sandbox_start_with_sandbox_id error: {str(e)}", exc_info=True) finally: await sandbox_manager.stop(sandbox_id) - - -@pytest.mark.need_ray -@pytest.mark.asyncio -async def test_get_actor_not_exist_raises_value_error(sandbox_manager): - sandbox_id = "unknown" - with pytest.raises(Exception) as exc_info: - await sandbox_manager.async_ray_get_actor(sandbox_id) - assert exc_info.type == ValueError \ No newline at end of file From 11e69bef3913bf6dfe0594618aecfe21b1c1a6b6 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Wed, 21 Jan 2026 07:54:53 +0000 Subject: [PATCH 02/25] fix test case: revert session related apis --- rock/admin/entrypoints/sandbox_api.py | 18 ++++++++++++++++ rock/sandbox/sandbox_manager.py | 30 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index b373e6210..0412067d0 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -91,6 +91,24 @@ async def get_status(sandbox_id: str): return RockResponse(result=await sandbox_manager.get_status(sandbox_id)) +@sandbox_router.post("/execute") +@handle_exceptions(error_message="execute command failed") +async def execute(command: SandboxCommand) -> RockResponse[CommandResponse]: + return RockResponse(result=await sandbox_manager.execute(command)) + + +@sandbox_router.post("/create_session") +@handle_exceptions(error_message="create session failed") +async def create_session(request: SandboxCreateBashSessionRequest) -> RockResponse[CreateBashSessionResponse]: + return RockResponse(result=await sandbox_manager.create_session(request)) + + +@sandbox_router.post("/run_in_session") +@handle_exceptions(error_message="run in session failed") +async def run(action: SandboxBashAction) -> RockResponse[BashObservation]: + return RockResponse(result=await sandbox_manager.run_in_session(action)) + + @sandbox_router.post("/stop") @handle_exceptions(error_message="stop sandbox failed") async def close(sandbox_id: str = Body(..., embed=True)) -> RockResponse[str]: diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index bc418641f..179b0826f 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -3,9 +3,17 @@ from rock import env_vars from rock.actions import ( + BashObservation, CommandResponse, + CreateBashSessionResponse, ) +<<<<<<< HEAD from rock.actions.sandbox.response import IsAliveResponse, State +======= +from rock.admin.proto.request import SandboxCreateSessionRequest as CreateSessionRequest +from rock.admin.proto.request import SandboxCommand as Command +from rock.admin.proto.request import SandboxAction as Action +>>>>>>> 14531e3 (fix test case: revert session related apis) from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService @@ -249,6 +257,28 @@ def get_info_from_response(self, response: SandboxStatusResponse) -> SandboxInfo memory=response.memory, port_mapping=response.port_mapping, ) + + async def create_session(self, request: CreateSessionRequest) -> CreateBashSessionResponse: + sandbox_actor = await self._deployment_service.async_ray_get_actor(request.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {request.sandbox_id} not found to create session") + await self._update_expire_time(request.sandbox_id) + return await self._deployment_service.async_ray_get(sandbox_actor.create_session.remote(request)) + + async def execute(self, command: Command) -> CommandResponse: + sandbox_actor = await self._deployment_service.async_ray_get_actor(command.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {command.sandbox_id} not found to execute") + await self._update_expire_time(command.sandbox_id) + return await self._deployment_service.async_ray_get(sandbox_actor.execute.remote(command)) + + @monitor_sandbox_operation() + async def run_in_session(self, action: Action) -> BashObservation: + sandbox_actor = await self._deployment_service.async_ray_get_actor(action.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {action.sandbox_id} not found to run in session") + await self._update_expire_time(action.sandbox_id) + return await self._deployment_service.async_ray_get(sandbox_actor.run_in_session.remote(action)) async def _is_expired(self, sandbox_id): timeout_dict = await self._redis_provider.json_get(timeout_sandbox_key(sandbox_id), "$") From 4063cb8b824c1c0c47c909ea08a59f7432daa705 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Wed, 21 Jan 2026 09:55:00 +0000 Subject: [PATCH 03/25] fix test case: add rock auth to sandbox info --- rock/sandbox/sandbox_manager.py | 14 +++++++------- rock/sandbox/service/deployment_service.py | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 179b0826f..bbea7638d 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -265,13 +265,6 @@ async def create_session(self, request: CreateSessionRequest) -> CreateBashSessi await self._update_expire_time(request.sandbox_id) return await self._deployment_service.async_ray_get(sandbox_actor.create_session.remote(request)) - async def execute(self, command: Command) -> CommandResponse: - sandbox_actor = await self._deployment_service.async_ray_get_actor(command.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {command.sandbox_id} not found to execute") - await self._update_expire_time(command.sandbox_id) - return await self._deployment_service.async_ray_get(sandbox_actor.execute.remote(command)) - @monitor_sandbox_operation() async def run_in_session(self, action: Action) -> BashObservation: sandbox_actor = await self._deployment_service.async_ray_get_actor(action.sandbox_id) @@ -280,6 +273,13 @@ async def run_in_session(self, action: Action) -> BashObservation: await self._update_expire_time(action.sandbox_id) return await self._deployment_service.async_ray_get(sandbox_actor.run_in_session.remote(action)) + async def execute(self, command: Command) -> CommandResponse: + sandbox_actor = await self._deployment_service.async_ray_get_actor(command.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {command.sandbox_id} not found to execute") + await self._update_expire_time(command.sandbox_id) + return await self._deployment_service.async_ray_get(sandbox_actor.execute.remote(command)) + async def _is_expired(self, sandbox_id): timeout_dict = await self._redis_provider.json_get(timeout_sandbox_key(sandbox_id), "$") if timeout_dict is None or len(timeout_dict) == 0: diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 8726812f6..24b0d267b 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -91,6 +91,7 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> Sandb user_id = user_info.get("user_id", "default") experiment_id = user_info.get("experiment_id", "default") namespace = user_info.get("namespace", "default") + rock_authorization = user_info.get("rock_authorization", "default") sandbox_actor.start.remote() sandbox_actor.set_user_id.remote(user_id) sandbox_actor.set_experiment_id.remote(experiment_id) @@ -100,6 +101,7 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> Sandb sandbox_info["experiment_id"] = experiment_id sandbox_info["namespace"] = namespace sandbox_info["state"] = State.PENDING + sandbox_info["rock_authorization"] = rock_authorization return sandbox_info async def creator_actor(self, config: DockerDeploymentConfig): From 1b8e903590434aed607e59892a014849c9bbc203 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Wed, 21 Jan 2026 12:22:39 +0000 Subject: [PATCH 04/25] fix test case: add rock auth in get status --- rock/actions/sandbox/sandbox_info.py | 1 + rock/sandbox/sandbox_manager.py | 54 ++++++++++++++++++---- rock/sandbox/service/deployment_service.py | 29 +++--------- 3 files changed, 54 insertions(+), 30 deletions(-) diff --git a/rock/actions/sandbox/sandbox_info.py b/rock/actions/sandbox/sandbox_info.py index 63bcdd9c0..dd0c96928 100644 --- a/rock/actions/sandbox/sandbox_info.py +++ b/rock/actions/sandbox/sandbox_info.py @@ -20,6 +20,7 @@ class SandboxInfo(TypedDict, total=False): create_user_gray_flag: bool cpus: float memory: str + alive: bool class SandboxListItem(SandboxInfo): diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index bbea7638d..f64885b52 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -7,13 +7,10 @@ CommandResponse, CreateBashSessionResponse, ) -<<<<<<< HEAD from rock.actions.sandbox.response import IsAliveResponse, State -======= from rock.admin.proto.request import SandboxCreateSessionRequest as CreateSessionRequest from rock.admin.proto.request import SandboxCommand as Command from rock.admin.proto.request import SandboxAction as Action ->>>>>>> 14531e3 (fix test case: revert session related apis) from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService @@ -36,8 +33,15 @@ trace_id_ctx_var, ) from rock.utils.format import parse_memory_size +<<<<<<< HEAD from rock.utils.providers.redis_provider import RedisProvider from rock.utils.service import build_sandbox_from_redis +======= +from rock.utils.providers import RedisProvider +from rock.admin.core.ray_service import RayService +from rock.rocklet import __version__ as swe_version +from rock.sandbox import __version__ as gateway_version +>>>>>>> ddca701 (fix test case: add rock auth in get status) logger = init_logger(__name__) @@ -71,7 +75,6 @@ async def submit(self, config: DeploymentConfig, user_info: dict = {}): async with self._ray_service.get_ray_rwlock().read_lock(): deployment_config: DeploymentConfig = await self.deployment_manager.init_config(config) sandbox_id = deployment_config.container_name - # deployment: AbstractDeployment = deployment_config.get_deployment() self.validate_sandbox_spec(self.rock_config.runtime, config) self._sandbox_meta[sandbox_id] = {"image": deployment_config.image} sandbox_info: SandboxInfo = await self._deployment_service.submit(deployment_config, user_info) @@ -134,14 +137,23 @@ async def _clear_redis_keys(self, sandbox_id): @monitor_sandbox_operation() async def get_status(self, sandbox_id) -> SandboxStatusResponse: async with self._ray_service.get_ray_rwlock().read_lock(): - response: SandboxStatusResponse = await self._deployment_service.get_status(sandbox_id) - sandbox_info: SandboxInfo = self.get_info_from_response(response) + deployment_info: SandboxInfo = await self._deployment_service.get_status(sandbox_id) + sandbox_info: SandboxInfo = None if self._redis_provider: + sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id) + if sandbox_info is None: + sandbox_info = deployment_info + else: + sandbox_info["state"] = deployment_info.get("state") await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) await self._update_expire_time(sandbox_id) - # logger.info(f"sandbox {sandbox_id} status is {remote_status}, write to redis") - return response + remote_info = {k: v for k, v in deployment_info.items() if k in ['status', 'port_mapping', 'alive']} + sandbox_info.update(remote_info) + logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") + else: + sandbox_info = deployment_info +<<<<<<< HEAD async def _get_sandbox_info(self, sandbox_id: str) -> SandboxInfo: """Get sandbox info, prioritize Redis, fallback to Ray Actor""" if self._redis_provider: @@ -257,6 +269,32 @@ def get_info_from_response(self, response: SandboxStatusResponse) -> SandboxInfo memory=response.memory, port_mapping=response.port_mapping, ) +======= + return SandboxStatusResponse( + sandbox_id=sandbox_id, + status=sandbox_info.get("status"), + state=sandbox_info.get("state"), + port_mapping=sandbox_info.get("port_mapping"), + host_name=sandbox_info.get("host_name"), + host_ip=sandbox_info.get("host_ip"), + is_alive=sandbox_info.get("alive"), + image=sandbox_info.get("image"), + swe_rex_version=swe_version, + gateway_version=gateway_version, + user_id=sandbox_info.get("user_id"), + experiment_id=sandbox_info.get("experiment_id"), + namespace=sandbox_info.get("namespace"), + cpus=sandbox_info.get("cpus"), + memory=sandbox_info.get("memory"), + ) + + async def build_sandbox_info_from_redis(self, sandbox_id: str) -> SandboxInfo | None: + if self._redis_provider: + sandbox_status = await self._redis_provider.json_get(alive_sandbox_key(sandbox_id), "$") + if sandbox_status and len(sandbox_status) > 0: + return sandbox_status[0] + return None +>>>>>>> ddca701 (fix test case: add rock auth in get status) async def create_session(self, request: CreateSessionRequest) -> CreateBashSessionResponse: sandbox_actor = await self._deployment_service.async_ray_get_actor(request.sandbox_id) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 24b0d267b..d1a199e0d 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -14,8 +14,6 @@ from rock.sandbox.sandbox_actor import SandboxActor from rock.sdk.common.exceptions import BadRequestRockError from rock.utils.format import parse_memory_size -from rock.rocklet import __version__ as swe_version -from rock.sandbox import __version__ as gateway_version logger = init_logger(__name__) @@ -26,12 +24,12 @@ async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: ... @abstractmethod - async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxStartResponse: + async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxInfo: """Get status of sandbox.""" ... @abstractmethod - async def get_status(self, *args, **kwargs) -> SandboxStatusResponse: + async def get_status(self, *args, **kwargs) -> SandboxInfo: """Get status of sandbox.""" ... @@ -128,30 +126,17 @@ async def stop(self, sandbox_id: str): logger.info(f"run time stop over {sandbox_id}") ray.kill(actor) - async def get_status(self, sandbox_id: str) -> SandboxStatusResponse: + async def get_status(self, sandbox_id: str) -> SandboxInfo: actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) + sandbox_info["status"] = remote_status.phases + sandbox_info["port_mapping"] = remote_status.get_port_mapping() alive = await self.async_ray_get(actor.is_alive.remote()) + sandbox_info["alive"] = alive.is_alive if alive.is_alive: sandbox_info["state"] = State.RUNNING - return SandboxStatusResponse( - sandbox_id=sandbox_id, - status=remote_status.phases, - port_mapping=remote_status.get_port_mapping(), - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=alive.is_alive, - image=sandbox_info.get("image"), - swe_rex_version=swe_version, - gateway_version=gateway_version, - user_id=sandbox_info.get("user_id"), - experiment_id=sandbox_info.get("experiment_id"), - namespace=sandbox_info.get("namespace"), - cpus=sandbox_info.get("cpus"), - memory=sandbox_info.get("memory"), - state=sandbox_info.get("state"), - ) + return sandbox_info async def get_mount(self, sandbox_id: str): actor = await self.async_ray_get_actor(sandbox_id) From 8a74b11ea74ccc9cd3a77dcca04afe57520e2070 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Wed, 21 Jan 2026 14:32:24 +0000 Subject: [PATCH 05/25] temproray remain apis for test case --- rock/admin/entrypoints/sandbox_api.py | 20 +++++++++++++ rock/sandbox/sandbox_manager.py | 42 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index 0412067d0..6f510007f 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -108,6 +108,26 @@ async def create_session(request: SandboxCreateBashSessionRequest) -> RockRespon async def run(action: SandboxBashAction) -> RockResponse[BashObservation]: return RockResponse(result=await sandbox_manager.run_in_session(action)) +@sandbox_router.post("/read_file") +@handle_exceptions(error_message="read file failed") +async def read_file(request: SandboxReadFileRequest) -> RockResponse[ReadFileResponse]: + return RockResponse(result=await sandbox_manager.read_file(request)) + + +@sandbox_router.post("/write_file") +@handle_exceptions(error_message="write file failed") +async def write_file(request: SandboxWriteFileRequest) -> RockResponse[WriteFileResponse]: + return RockResponse(result=await sandbox_manager.write_file(request)) + + +@sandbox_router.post("/upload") +@handle_exceptions(error_message="upload file failed") +async def upload( + file: UploadFile = File(...), + target_path: str = Form(...), + sandbox_id: str | None = Form(None), +) -> RockResponse[UploadResponse]: + return RockResponse(result=await sandbox_manager.upload(file, target_path, sandbox_id)) @sandbox_router.post("/stop") @handle_exceptions(error_message="stop sandbox failed") diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index f64885b52..c282977c1 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -1,11 +1,16 @@ import asyncio import time +from fastapi import UploadFile +import ray from rock import env_vars from rock.actions import ( BashObservation, CommandResponse, CreateBashSessionResponse, + ReadFileResponse, + UploadResponse, + WriteFileResponse, ) from rock.actions.sandbox.response import IsAliveResponse, State from rock.admin.proto.request import SandboxCreateSessionRequest as CreateSessionRequest @@ -17,12 +22,15 @@ from rock.admin.core.redis_key import ALIVE_PREFIX, alive_sandbox_key, timeout_sandbox_key from rock.admin.metrics.decorator import monitor_sandbox_operation from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse +from rock.admin.proto.request import SandboxReadFileRequest as ReadFileRequest +from rock.admin.proto.request import SandboxWriteFileRequest as WriteFileRequest from rock.config import RockConfig, RuntimeConfig from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig from rock.deployments.constants import Port from rock.deployments.status import PersistedServiceStatus, ServiceStatus from rock.deployments.abstract import AbstractDeployment from rock.deployments.config import DeploymentConfig + from rock.logger import init_logger from rock.sandbox.base_manager import BaseManager from rock.sandbox.service.deployment_service import AbstractDeploymentService, RayDeploymentService @@ -66,6 +74,14 @@ def __init__( self._deployment_service = RayDeploymentService(ray_namespace=ray_namespace) logger.info("sandbox service init success") + # TODO:remain for test, delete it after test refactor + async def async_ray_get_actor(self, sandbox_id: str): + return await self._deployment_service.async_ray_get_actor(sandbox_id) + + # TODO:remain for test, delete it after test refactor + async def async_ray_get(self, ray_future: ray.ObjectRef): + return await self._deployment_service.async_ray_get(ray_future) + @monitor_sandbox_operation() async def start_async(self, config: DeploymentConfig, user_info: dict = {}) -> SandboxStartResponse: return await self.submit(config, user_info) @@ -317,6 +333,32 @@ async def execute(self, command: Command) -> CommandResponse: raise Exception(f"sandbox {command.sandbox_id} not found to execute") await self._update_expire_time(command.sandbox_id) return await self._deployment_service.async_ray_get(sandbox_actor.execute.remote(command)) + + # TODO:remain for test, delete it after test refactor + async def read_file(self, request: ReadFileRequest) -> ReadFileResponse: + sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {request.sandbox_id} not found to read file") + await self._update_expire_time(request.sandbox_id) + return await self.async_ray_get(sandbox_actor.read_file.remote(request)) + + # TODO:remain for test, delete it after test refactor + @monitor_sandbox_operation() + async def write_file(self, request: WriteFileRequest) -> WriteFileResponse: + sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {request.sandbox_id} not found to write file") + await self._update_expire_time(request.sandbox_id) + return await self.async_ray_get(sandbox_actor.write_file.remote(request)) + + # TODO:remain for test, delete it after test refactor + @monitor_sandbox_operation() + async def upload(self, file: UploadFile, target_path: str, sandbox_id: str) -> UploadResponse: + sandbox_actor = await self.async_ray_get_actor(sandbox_id) + if sandbox_actor is None: + raise Exception(f"sandbox {sandbox_id} not found to upload file") + await self._update_expire_time(sandbox_id) + return await self.async_ray_get(sandbox_actor.upload.remote(file, target_path)) async def _is_expired(self, sandbox_id): timeout_dict = await self._redis_provider.json_get(timeout_sandbox_key(sandbox_id), "$") From fdef1a1bc8ef54a1236fda97ecde7c3afc331063 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Thu, 22 Jan 2026 07:14:49 +0000 Subject: [PATCH 06/25] remove ray dependency from sandbox manager layer --- rock/sandbox/gem_manager.py | 45 ++++------------ rock/sandbox/sandbox_manager.py | 47 ++++------------ rock/sandbox/service/deployment_service.py | 63 ++++++++++++++++++++-- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/rock/sandbox/gem_manager.py b/rock/sandbox/gem_manager.py index 742d0972a..6c20b66d1 100644 --- a/rock/sandbox/gem_manager.py +++ b/rock/sandbox/gem_manager.py @@ -16,7 +16,6 @@ from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse from rock.config import RockConfig from rock.deployments.config import DockerDeploymentConfig -from rock.sandbox.sandbox_actor import SandboxActor from rock.sandbox.sandbox_manager import SandboxManager from rock.utils.providers import RedisProvider from rock.admin.core.ray_service import RayService @@ -35,7 +34,7 @@ def __init__( async def env_make(self, env_id: str) -> EnvMakeResponse: config = DockerDeploymentConfig(image=env_vars.ROCK_ENVHUB_DEFAULT_DOCKER_IMAGE) - sandbox_start_response: SandboxStartResponse = await self.start_async(config=config) + sandbox_start_response: SandboxStartResponse = await self.submit(config=config) async def wait_until_alive(sandbox_id: str, interval: float = 1.0): """Internal polling method""" @@ -53,44 +52,22 @@ async def wait_until_alive(sandbox_id: str, interval: float = 1.0): except asyncio.TimeoutError: raise Exception("Sandbox startup timeout after 300s") - sandbox_actor: SandboxActor = await self.async_ray_get_actor(sandbox_start_response.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_start_response.sandbox_id} not found to stop") - response = await self.async_ray_get( - sandbox_actor.env_make.remote( - EnvMakeRequest( - env_id=env_id, - sandbox_id=sandbox_start_response.sandbox_id, - ) + make_response = await self._deployment_service.env_make( + EnvMakeRequest( + env_id=env_id, + sandbox_id=sandbox_start_response.sandbox_id, ) ) - return response - + return make_response + async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: - sandbox_id = request.sandbox_id - sandbox_actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to stop") - return await self.async_ray_get(sandbox_actor.env_step.remote(request)) + return await self._deployment_service.env_step(request) async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: - sandbox_id = request.sandbox_id - sandbox_actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to stop") - return await self.async_ray_get(sandbox_actor.env_reset.remote(request)) + return await self._deployment_service.env_reset(request) async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: - sandbox_id = request.sandbox_id - sandbox_actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to stop") - response = await self.async_ray_get(sandbox_actor.env_close.remote(request)) - await self.stop(sandbox_id=sandbox_id) - return response + return await self._deployment_service.env_close(request) async def env_list(self, sandbox_id: str) -> EnvListResponse: - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to stop") - return await self.async_ray_get(sandbox_actor.env_list.remote()) + return await self._deployment_service.env_list(sandbox_id) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index c282977c1..635a37ea3 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -1,7 +1,6 @@ import asyncio import time from fastapi import UploadFile -import ray from rock import env_vars from rock.actions import ( @@ -34,6 +33,7 @@ from rock.logger import init_logger from rock.sandbox.base_manager import BaseManager from rock.sandbox.service.deployment_service import AbstractDeploymentService, RayDeploymentService +from rock.sandbox.service.sandbox_proxy_service import SandboxProxyService from rock.sdk.common.exceptions import BadRequestRockError from rock.utils import ( EAGLE_EYE_TRACE_ID, @@ -57,6 +57,7 @@ class SandboxManager(BaseManager): _ray_namespace: str = None _deployment_service: AbstractDeploymentService = None + _proxy_service: SandboxProxyService = None def __init__( self, @@ -72,15 +73,9 @@ def __init__( self._ray_service = ray_service self._ray_namespace = ray_namespace self._deployment_service = RayDeploymentService(ray_namespace=ray_namespace) + self._proxy_service = SandboxProxyService(rock_config, redis_provider) logger.info("sandbox service init success") - # TODO:remain for test, delete it after test refactor - async def async_ray_get_actor(self, sandbox_id: str): - return await self._deployment_service.async_ray_get_actor(sandbox_id) - - # TODO:remain for test, delete it after test refactor - async def async_ray_get(self, ray_future: ray.ObjectRef): - return await self._deployment_service.async_ray_get(ray_future) @monitor_sandbox_operation() async def start_async(self, config: DeploymentConfig, user_info: dict = {}) -> SandboxStartResponse: @@ -313,52 +308,28 @@ async def build_sandbox_info_from_redis(self, sandbox_id: str) -> SandboxInfo | >>>>>>> ddca701 (fix test case: add rock auth in get status) async def create_session(self, request: CreateSessionRequest) -> CreateBashSessionResponse: - sandbox_actor = await self._deployment_service.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to create session") - await self._update_expire_time(request.sandbox_id) - return await self._deployment_service.async_ray_get(sandbox_actor.create_session.remote(request)) + return await self._proxy_service.create_session(request) @monitor_sandbox_operation() async def run_in_session(self, action: Action) -> BashObservation: - sandbox_actor = await self._deployment_service.async_ray_get_actor(action.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {action.sandbox_id} not found to run in session") - await self._update_expire_time(action.sandbox_id) - return await self._deployment_service.async_ray_get(sandbox_actor.run_in_session.remote(action)) + return await self._proxy_service.run_in_session(action) async def execute(self, command: Command) -> CommandResponse: - sandbox_actor = await self._deployment_service.async_ray_get_actor(command.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {command.sandbox_id} not found to execute") - await self._update_expire_time(command.sandbox_id) - return await self._deployment_service.async_ray_get(sandbox_actor.execute.remote(command)) + return await self._proxy_service.execute(command) # TODO:remain for test, delete it after test refactor async def read_file(self, request: ReadFileRequest) -> ReadFileResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to read file") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.read_file.remote(request)) + return await self._proxy_service.read_file(request) # TODO:remain for test, delete it after test refactor @monitor_sandbox_operation() async def write_file(self, request: WriteFileRequest) -> WriteFileResponse: - sandbox_actor = await self.async_ray_get_actor(request.sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {request.sandbox_id} not found to write file") - await self._update_expire_time(request.sandbox_id) - return await self.async_ray_get(sandbox_actor.write_file.remote(request)) + return await self._proxy_service.write_file(request) # TODO:remain for test, delete it after test refactor @monitor_sandbox_operation() async def upload(self, file: UploadFile, target_path: str, sandbox_id: str) -> UploadResponse: - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to upload file") - await self._update_expire_time(sandbox_id) - return await self.async_ray_get(sandbox_actor.upload.remote(file, target_path)) + return await self._proxy_service.upload(file, target_path, sandbox_id) async def _is_expired(self, sandbox_id): timeout_dict = await self._redis_provider.json_get(timeout_sandbox_key(sandbox_id), "$") diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index d1a199e0d..04dbf23bf 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -1,14 +1,14 @@ from abc import abstractmethod import asyncio +from rock.actions.envs.request import EnvCloseRequest, EnvMakeRequest, EnvResetRequest, EnvStepRequest +from rock.actions.envs.response import EnvCloseResponse, EnvListResponse, EnvMakeResponse, EnvResetResponse, EnvStepResponse from rock.actions.sandbox.response import CommandResponse, State from rock.actions.sandbox.sandbox_info import SandboxInfo -from rock.admin.proto.response import SandboxStartResponse, SandboxStatusResponse from rock.deployments.abstract import AbstractDeployment import ray from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig from rock.deployments.constants import Status from rock.deployments.docker import DockerDeployment -from rock.deployments.ray import RayDeployment from rock.deployments.status import ServiceStatus from rock.logger import init_logger from rock.sandbox.sandbox_actor import SandboxActor @@ -51,6 +51,25 @@ async def get_sandbox_statistics(self, *args, **kwargs): async def commit(self, *args, **kwargs) -> CommandResponse: ... + @abstractmethod + async def env_step(self, *args, **kwargs): + ... + + @abstractmethod + async def env_make(self, *args, **kwargs): + ... + + @abstractmethod + async def env_reset(self, *args, **kwargs): + ... + + @abstractmethod + async def env_list(self, *args, **kwargs): + ... + + @abstractmethod + async def env_close(self, *args, **kwargs): + ... class RayDeploymentService(): def __init__(self, ray_namespace: str): @@ -150,9 +169,9 @@ async def get_sandbox_statistics(self, sandbox_id: str): logger.info(f"get_sandbox_statistics: {result}") return result - async def commit(self, *args, **kwargs) -> CommandResponse: - actor = await self._ray_actor - result = await self.async_ray_get(actor.commit.remote(*args, **kwargs)) + async def commit(self, sandbox_id) -> CommandResponse: + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.commit.remote()) logger.info(f"commit: {result}") return result @@ -162,3 +181,37 @@ async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) logger.info(f"get_deployment: {status}") return status.phases["docker_run"] == Status.RUNNING + + async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.env_step.remote(request)) + logger.info(f"env_step: {result}") + return result + + async def env_make(self, request: EnvMakeRequest) -> EnvMakeResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.env_make.remote(request)) + logger.info(f"env_make: {result}") + return result + + async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.env_reset.remote(request)) + logger.info(f"env_reset: {result}") + return result + + async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.env_close.remote(request)) + logger.info(f"env_close: {result}") + return result + + async def env_list(self, sandbox_id) -> EnvListResponse: + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.env_list.remote()) + logger.info(f"env_list: {result}") + return result From 4e152d7400a065f75c425f739e61a52e61d2824b Mon Sep 17 00:00:00 2001 From: daifangwen Date: Thu, 22 Jan 2026 09:49:09 +0000 Subject: [PATCH 07/25] fix test case: add fake redis for test env --- rock/admin/main.py | 6 +- rock/sandbox/sandbox_manager.py | 65 +++++++++---------- rock/sandbox/service/deployment_service.py | 2 +- rock/sandbox/service/sandbox_proxy_service.py | 1 + tests/integration/conftest.py | 2 +- 5 files changed, 37 insertions(+), 39 deletions(-) diff --git a/rock/admin/main.py b/rock/admin/main.py index 72f24646d..d8f3d3394 100644 --- a/rock/admin/main.py +++ b/rock/admin/main.py @@ -49,7 +49,11 @@ async def lifespan(app: FastAPI): env_vars.ROCK_ADMIN_ROLE = args.role # init redis provider - if args.env == "local": + if args.env == "test": + from fakeredis import aioredis + redis_provider = RedisProvider(host=None, port=None, password="") + redis_provider.client = aioredis.FakeRedis(decode_responses=True) + elif args.env == "local": redis_provider = None else: redis_provider = RedisProvider( diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 635a37ea3..a745154f9 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -41,15 +41,12 @@ trace_id_ctx_var, ) from rock.utils.format import parse_memory_size -<<<<<<< HEAD from rock.utils.providers.redis_provider import RedisProvider from rock.utils.service import build_sandbox_from_redis -======= from rock.utils.providers import RedisProvider from rock.admin.core.ray_service import RayService from rock.rocklet import __version__ as swe_version from rock.sandbox import __version__ as gateway_version ->>>>>>> ddca701 (fix test case: add rock auth in get status) logger = init_logger(__name__) @@ -151,20 +148,30 @@ async def get_status(self, sandbox_id) -> SandboxStatusResponse: deployment_info: SandboxInfo = await self._deployment_service.get_status(sandbox_id) sandbox_info: SandboxInfo = None if self._redis_provider: - sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id) - if sandbox_info is None: - sandbox_info = deployment_info - else: - sandbox_info["state"] = deployment_info.get("state") + sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id, deployment_info) await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) await self._update_expire_time(sandbox_id) - remote_info = {k: v for k, v in deployment_info.items() if k in ['status', 'port_mapping', 'alive']} - sandbox_info.update(remote_info) logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") else: sandbox_info = deployment_info + return SandboxStatusResponse( + sandbox_id=sandbox_id, + status=sandbox_info.get("phases"), + state=sandbox_info.get("state"), + port_mapping=sandbox_info.get("port_mapping"), + host_name=sandbox_info.get("host_name"), + host_ip=sandbox_info.get("host_ip"), + is_alive=sandbox_info.get("alive"), + image=sandbox_info.get("image"), + swe_rex_version=swe_version, + gateway_version=gateway_version, + user_id=sandbox_info.get("user_id"), + experiment_id=sandbox_info.get("experiment_id"), + namespace=sandbox_info.get("namespace"), + cpus=sandbox_info.get("cpus"), + memory=sandbox_info.get("memory"), + ) -<<<<<<< HEAD async def _get_sandbox_info(self, sandbox_id: str) -> SandboxInfo: """Get sandbox info, prioritize Redis, fallback to Ray Actor""" if self._redis_provider: @@ -280,32 +287,18 @@ def get_info_from_response(self, response: SandboxStatusResponse) -> SandboxInfo memory=response.memory, port_mapping=response.port_mapping, ) -======= - return SandboxStatusResponse( - sandbox_id=sandbox_id, - status=sandbox_info.get("status"), - state=sandbox_info.get("state"), - port_mapping=sandbox_info.get("port_mapping"), - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=sandbox_info.get("alive"), - image=sandbox_info.get("image"), - swe_rex_version=swe_version, - gateway_version=gateway_version, - user_id=sandbox_info.get("user_id"), - experiment_id=sandbox_info.get("experiment_id"), - namespace=sandbox_info.get("namespace"), - cpus=sandbox_info.get("cpus"), - memory=sandbox_info.get("memory"), - ) - async def build_sandbox_info_from_redis(self, sandbox_id: str) -> SandboxInfo | None: - if self._redis_provider: - sandbox_status = await self._redis_provider.json_get(alive_sandbox_key(sandbox_id), "$") - if sandbox_status and len(sandbox_status) > 0: - return sandbox_status[0] - return None ->>>>>>> ddca701 (fix test case: add rock auth in get status) + async def build_sandbox_info_from_redis(self, sandbox_id: str, deployment_info: SandboxInfo) -> SandboxInfo | None: + sandbox_status = await self._redis_provider.json_get(alive_sandbox_key(sandbox_id), "$") + if sandbox_status and len(sandbox_status) > 0: + sandbox_info = sandbox_status[0] + remote_info = {k: v for k, v in deployment_info.items() if k in ['phases', 'port_mapping', 'alive', 'state']} + if 'phases' in remote_info and remote_info['phases']: + remote_info['phases'] = {name: phase.to_dict() for name, phase in remote_info['phases'].items()} + sandbox_info.update(remote_info) + else: + sandbox_info = deployment_info + return sandbox_info async def create_session(self, request: CreateSessionRequest) -> CreateBashSessionResponse: return await self._proxy_service.create_session(request) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 04dbf23bf..1202017d3 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -149,7 +149,7 @@ async def get_status(self, sandbox_id: str) -> SandboxInfo: actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) - sandbox_info["status"] = remote_status.phases + sandbox_info["phases"] = remote_status.phases sandbox_info["port_mapping"] = remote_status.get_port_mapping() alive = await self.async_ray_get(actor.is_alive.remote()) sandbox_info["alive"] = alive.is_alive diff --git a/rock/sandbox/service/sandbox_proxy_service.py b/rock/sandbox/service/sandbox_proxy_service.py index 8b896bf0f..07972ea23 100644 --- a/rock/sandbox/service/sandbox_proxy_service.py +++ b/rock/sandbox/service/sandbox_proxy_service.py @@ -88,6 +88,7 @@ async def create_session(self, request: CreateSessionRequest) -> CreateBashSessi sandbox_id = request.sandbox_id await self._update_expire_time(sandbox_id) sandbox_status_dicts = await self.get_service_status(sandbox_id) + print(f"sandbox status dicts: {sandbox_status_dicts}") response = await self._send_request( sandbox_id, sandbox_status_dicts[0], "create_session", None, request.model_dump(), None, "POST" ) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index e65fd79bb..0f2a42a0a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -106,7 +106,7 @@ def admin_remote_server(): [ "admin", "--env", - "local", + "test", "--role", "admin", "--port", From 3090dc94d493cda5d12dae4071e6460e5bade669 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Thu, 22 Jan 2026 12:02:33 +0000 Subject: [PATCH 08/25] fix test case: fix status code of run in session method --- rock/admin/entrypoints/sandbox_api.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rock/admin/entrypoints/sandbox_api.py b/rock/admin/entrypoints/sandbox_api.py index 6f510007f..59cbb9e68 100644 --- a/rock/admin/entrypoints/sandbox_api.py +++ b/rock/admin/entrypoints/sandbox_api.py @@ -13,6 +13,7 @@ UploadResponse, WriteFileResponse, ) +from rock.actions.response import ResponseStatus from rock.admin.proto.request import ( SandboxBashAction, SandboxCloseBashSessionRequest, @@ -106,7 +107,10 @@ async def create_session(request: SandboxCreateBashSessionRequest) -> RockRespon @sandbox_router.post("/run_in_session") @handle_exceptions(error_message="run in session failed") async def run(action: SandboxBashAction) -> RockResponse[BashObservation]: - return RockResponse(result=await sandbox_manager.run_in_session(action)) + result = await sandbox_manager.run_in_session(action) + if result.exit_code is not None and result.exit_code == -1: + return RockResponse(status=ResponseStatus.FAILED, error=result.failure_reason) + return RockResponse(result=result) @sandbox_router.post("/read_file") @handle_exceptions(error_message="read file failed") From d5db9f49bede995bde043168ba3dca6cc3499a89 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Thu, 22 Jan 2026 13:22:10 +0000 Subject: [PATCH 09/25] opt: remove get_deployment method in sandbox manager --- rock/sandbox/sandbox_manager.py | 15 +---- rock/sandbox/service/deployment_service.py | 16 ++--- tests/unit/sandbox/test_sandbox_manager.py | 71 ++++++++++------------ 3 files changed, 41 insertions(+), 61 deletions(-) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index a745154f9..f7ec78a03 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -127,12 +127,7 @@ async def get_mount(self, sandbox_id): async def commit(self, sandbox_id, image_tag: str, username: str, password: str) -> CommandResponse: async with self._ray_service.get_ray_rwlock().read_lock(): logger.info(f"commit sandbox {sandbox_id}") - deployment: AbstractDeployment = await self._deployment_service.get_deployment(sandbox_id) - if deployment is None: - await self._clear_redis_keys(sandbox_id) - raise Exception(f"sandbox {sandbox_id} not found to commit") - logger.info(f"begin to commit {sandbox_id} to {image_tag}") - result = await deployment.commit(image_tag, username, password) + result = await self._deployment_service.commit(sandbox_id, image_tag, username, password) logger.info(f"commit {sandbox_id} to {image_tag} finished, result {result}") return result @@ -336,14 +331,6 @@ async def _is_expired(self, sandbox_id): logger.info(f"sandbox_id:[{sandbox_id}] is already cleared") return True - async def _is_deployment_alive(self, sandbox_id): - try: - deployment = await self._deployment_service.get_deployment(sandbox_id) - return deployment is not None - except Exception as e: - logger.error("get deployment failed", exc_info=e) - return False - async def _check_job_background(self): if not self._redis_provider: return diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 1202017d3..4b5a78164 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -20,7 +20,7 @@ class AbstractDeploymentService(): @abstractmethod - async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: + async def is_deployment_alive(self, sandbox_id) -> bool: ... @abstractmethod @@ -78,6 +78,13 @@ def __init__(self, ray_namespace: str): def _get_actor_name(self, sandbox_id): return f"sandbox-{sandbox_id}" + async def is_deployment_alive(self, sandbox_id) -> bool: + try: + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + except ValueError: + return False + return await self.async_ray_get(actor.is_alive.remote()) + async def async_ray_get_actor(self, sandbox_id: str): """Async wrapper for ray.get_actor() using asyncio.to_thread for non-blocking execution.""" try: @@ -174,13 +181,6 @@ async def commit(self, sandbox_id) -> CommandResponse: result = await self.async_ray_get(actor.commit.remote()) logger.info(f"commit: {result}") return result - - # TODO: considering modify the result to deployment inside sandbox actor - async def get_deployment(self, sandbox_id: str) -> AbstractDeployment: - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) - logger.info(f"get_deployment: {status}") - return status.phases["docker_run"] == Status.RUNNING async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: sandbox_id = request.sandbox_id diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 5826a9a56..f2f8ed4fe 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -21,23 +21,16 @@ @pytest.mark.need_ray @pytest.mark.asyncio async def test_async_sandbox_start(sandbox_manager: SandboxManager): - response = await sandbox_manager.start_async(DockerDeploymentConfig()) + response = await sandbox_manager.submit(DockerDeploymentConfig()) sandbox_id = response.sandbox_id assert sandbox_id is not None - search_start_time = time.time() - while time.time() - search_start_time < 60: - is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) - if is_alive_response: - break + assert wait_sandbox_instance_alive(sandbox_manager, sandbox_id) - is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) - assert is_alive_response + assert await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) - # TODO: fix async_ray_get_actor for it is not a general method - sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(sandbox_id) - assert sandbox_actor is not None - assert await sandbox_actor.user_id.remote() == "default" - assert await sandbox_actor.experiment_id.remote() == "default" + sandbox_status = await sandbox_manager.get_status(sandbox_id) + assert sandbox_status.user_id == "default" + assert sandbox_status.experiment_id == "default" await sandbox_manager.stop(sandbox_id) @@ -45,7 +38,7 @@ async def test_async_sandbox_start(sandbox_manager: SandboxManager): @pytest.mark.need_ray @pytest.mark.asyncio async def test_get_status(sandbox_manager): - response = await sandbox_manager.start_async(DockerDeploymentConfig(image="python:3.11")) + response = await sandbox_manager.submit(DockerDeploymentConfig(image="python:3.11")) await asyncio.sleep(5) docker_status: SandboxStatusResponse = await sandbox_manager.get_status(response.sandbox_id) assert docker_status.status["docker_run"] @@ -69,43 +62,32 @@ async def test_get_status(sandbox_manager): async def test_ray_actor_is_alive(sandbox_manager): docker_deploy_config = DockerDeploymentConfig() - response = await sandbox_manager.start_async(docker_deploy_config) + response = await sandbox_manager.submit(docker_deploy_config) assert response.sandbox_id is not None - assert await sandbox_manager._is_deployment_alive(response.sandbox_id) + assert wait_sandbox_instance_alive(sandbox_manager, response.sandbox_id) sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(response.sandbox_id) ray.kill(sandbox_actor) - assert not await sandbox_manager._is_deployment_alive(response.sandbox_id) + assert not await sandbox_manager._deployment_service.is_deployment_alive(response.sandbox_id) @pytest.mark.need_ray @pytest.mark.asyncio async def test_user_info_set_success(sandbox_manager): user_info = {"user_id": "test_user_id", "experiment_id": "test_experiment_id"} - response = await sandbox_manager.start_async(RayDeploymentConfig(), user_info=user_info) + response = await sandbox_manager.submit(RayDeploymentConfig(), user_info=user_info) sandbox_id = response.sandbox_id - cnt = 0 - while True: - is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) - if is_alive_response: - break - time.sleep(1) - cnt += 1 - if cnt > 60: - raise Exception("sandbox not alive") + assert wait_sandbox_instance_alive(sandbox_manager, sandbox_id) - is_alive_response = await sandbox_manager._is_deployment_alive(sandbox_id) + is_alive_response = await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) assert is_alive_response - sandbox_deployment = await sandbox_manager._deployment_service.get_deployment(sandbox_id) - assert sandbox_deployment is not None - - ray_actor = await sandbox_manager._deployment_service.async_ray_get_actor(sandbox_id) - assert await ray_actor.user_id.remote() == "test_user_id" - assert await ray_actor.experiment_id.remote() == "test_experiment_id" + sandbox_status = await sandbox_manager.get_status(sandbox_id) + assert sandbox_status.user_id == "test_user_id" + assert sandbox_status.experiment_id == "test_experiment_id" await sandbox_manager.stop(sandbox_id) @@ -121,7 +103,7 @@ def test_set_sandbox_status_response(): async def test_resource_limit_exception(sandbox_manager, docker_deployment_config): docker_deployment_config.cpus = 20 with pytest.raises(BadRequestRockError) as e: - await sandbox_manager.start_async(docker_deployment_config) + await sandbox_manager.submit(docker_deployment_config) logger.warning(f"Resource limit exception: {str(e)}", exc_info=True) @@ -130,7 +112,7 @@ async def test_resource_limit_exception(sandbox_manager, docker_deployment_confi async def test_resource_limit_exception_memory(sandbox_manager, docker_deployment_config): docker_deployment_config.memory = "65g" with pytest.raises(BadRequestRockError) as e: - await sandbox_manager.start_async(docker_deployment_config) + await sandbox_manager.submit(docker_deployment_config) logger.warning(f"Resource limit exception: {str(e)}", exc_info=True) @@ -147,7 +129,7 @@ async def test_get_system_resource_info(sandbox_manager): @pytest.mark.need_ray @pytest.mark.asyncio async def test_get_status_state(sandbox_manager): - response = await sandbox_manager.start_async( + response = await sandbox_manager.submit( DockerDeploymentConfig(), ) sandbox_id = response.sandbox_id @@ -162,11 +144,11 @@ async def test_get_status_state(sandbox_manager): async def test_sandbox_start_with_sandbox_id(sandbox_manager): try: sandbox_id = uuid.uuid4().hex - response = await sandbox_manager.start_async(DockerDeploymentConfig(container_name=sandbox_id)) + response = await sandbox_manager.submit(DockerDeploymentConfig(container_name=sandbox_id)) assert response.sandbox_id == sandbox_id await check_sandbox_status_until_alive(sandbox_manager, sandbox_id) with pytest.raises(BadRequestRockError) as e: - await sandbox_manager.start_async( + await sandbox_manager.submit( DockerDeploymentConfig(container_name=sandbox_id), sandbox_id=sandbox_id, ) @@ -174,3 +156,14 @@ async def test_sandbox_start_with_sandbox_id(sandbox_manager): logger.error(f"test_sandbox_start_with_sandbox_id error: {str(e)}", exc_info=True) finally: await sandbox_manager.stop(sandbox_id) + +async def wait_sandbox_instance_alive(sandbox_manager: SandboxManager, sandbox_id: str) -> bool: + cnt = 0 + while True: + is_alive_response = await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) + if is_alive_response: + return True + time.sleep(1) + cnt += 1 + if cnt > 60: + raise Exception("sandbox not alive") From cc1a2e2922e670a114410fdcfa83feeb2d790ad6 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Thu, 22 Jan 2026 13:50:07 +0000 Subject: [PATCH 10/25] opt: move ray service inside deployment service --- rock/sandbox/sandbox_manager.py | 135 ++++++++++----------- rock/sandbox/service/deployment_service.py | 99 ++++++++------- tests/unit/conftest.py | 4 +- tests/unit/sandbox/test_sandbox_manager.py | 6 +- 4 files changed, 123 insertions(+), 121 deletions(-) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index f7ec78a03..c923e05a8 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -67,9 +67,8 @@ def __init__( super().__init__( rock_config, redis_provider=redis_provider, enable_runtime_auto_clear=enable_runtime_auto_clear ) - self._ray_service = ray_service self._ray_namespace = ray_namespace - self._deployment_service = RayDeploymentService(ray_namespace=ray_namespace) + self._deployment_service = RayDeploymentService(ray_namespace=ray_namespace, ray_service=ray_service) self._proxy_service = SandboxProxyService(rock_config, redis_provider) logger.info("sandbox service init success") @@ -80,56 +79,52 @@ async def start_async(self, config: DeploymentConfig, user_info: dict = {}) -> S @monitor_sandbox_operation() async def submit(self, config: DeploymentConfig, user_info: dict = {}): - async with self._ray_service.get_ray_rwlock().read_lock(): - deployment_config: DeploymentConfig = await self.deployment_manager.init_config(config) - sandbox_id = deployment_config.container_name - self.validate_sandbox_spec(self.rock_config.runtime, config) - self._sandbox_meta[sandbox_id] = {"image": deployment_config.image} - sandbox_info: SandboxInfo = await self._deployment_service.submit(deployment_config, user_info) - logger.info(f"sandbox {sandbox_id} is submitted") - - stop_time = str(int(time.time()) + deployment_config.auto_clear_time * 60) - auto_clear_time_dict = { - env_vars.ROCK_SANDBOX_AUTO_CLEAR_TIME_KEY: str(deployment_config.auto_clear_time), - env_vars.ROCK_SANDBOX_EXPIRE_TIME_KEY: stop_time, - } - if self._redis_provider: - await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) - await self._redis_provider.json_set(timeout_sandbox_key(sandbox_id), "$", auto_clear_time_dict) - - return SandboxStartResponse( - sandbox_id=sandbox_id, - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - ) + deployment_config: DeploymentConfig = await self.deployment_manager.init_config(config) + sandbox_id = deployment_config.container_name + self.validate_sandbox_spec(self.rock_config.runtime, config) + self._sandbox_meta[sandbox_id] = {"image": deployment_config.image} + sandbox_info: SandboxInfo = await self._deployment_service.submit(deployment_config, user_info) + logger.info(f"sandbox {sandbox_id} is submitted") + + stop_time = str(int(time.time()) + deployment_config.auto_clear_time * 60) + auto_clear_time_dict = { + env_vars.ROCK_SANDBOX_AUTO_CLEAR_TIME_KEY: str(deployment_config.auto_clear_time), + env_vars.ROCK_SANDBOX_EXPIRE_TIME_KEY: stop_time, + } + if self._redis_provider: + await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) + await self._redis_provider.json_set(timeout_sandbox_key(sandbox_id), "$", auto_clear_time_dict) + + return SandboxStartResponse( + sandbox_id=sandbox_id, + host_name=sandbox_info.get("host_name"), + host_ip=sandbox_info.get("host_ip"), + ) @monitor_sandbox_operation() async def stop(self, sandbox_id): - async with self._ray_service.get_ray_rwlock().read_lock(): - logger.info(f"stop sandbox {sandbox_id}") - try: - await self._deployment_service.stop(sandbox_id) - except ValueError as e: - logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) - await self._clear_redis_keys(sandbox_id) - try: - self._sandbox_meta.pop(sandbox_id) - except KeyError: - logger.debug(f"{sandbox_id} key not found") - logger.info(f"sandbox {sandbox_id} stopped") + logger.info(f"stop sandbox {sandbox_id}") + try: + await self._deployment_service.stop(sandbox_id) + except ValueError as e: + logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) await self._clear_redis_keys(sandbox_id) + try: + self._sandbox_meta.pop(sandbox_id) + except KeyError: + logger.debug(f"{sandbox_id} key not found") + logger.info(f"sandbox {sandbox_id} stopped") + await self._clear_redis_keys(sandbox_id) async def get_mount(self, sandbox_id): - async with self._ray_service.get_ray_rwlock().read_lock(): - return self._deployment_service.get_mount(sandbox_id) + return self._deployment_service.get_mount(sandbox_id) @monitor_sandbox_operation() async def commit(self, sandbox_id, image_tag: str, username: str, password: str) -> CommandResponse: - async with self._ray_service.get_ray_rwlock().read_lock(): - logger.info(f"commit sandbox {sandbox_id}") - result = await self._deployment_service.commit(sandbox_id, image_tag, username, password) - logger.info(f"commit {sandbox_id} to {image_tag} finished, result {result}") - return result + logger.info(f"commit sandbox {sandbox_id}") + result = await self._deployment_service.commit(sandbox_id, image_tag, username, password) + logger.info(f"commit {sandbox_id} to {image_tag} finished, result {result}") + return result async def _clear_redis_keys(self, sandbox_id): if self._redis_provider: @@ -139,33 +134,33 @@ async def _clear_redis_keys(self, sandbox_id): @monitor_sandbox_operation() async def get_status(self, sandbox_id) -> SandboxStatusResponse: - async with self._ray_service.get_ray_rwlock().read_lock(): - deployment_info: SandboxInfo = await self._deployment_service.get_status(sandbox_id) - sandbox_info: SandboxInfo = None - if self._redis_provider: - sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id, deployment_info) - await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) - await self._update_expire_time(sandbox_id) - logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") - else: - sandbox_info = deployment_info - return SandboxStatusResponse( - sandbox_id=sandbox_id, - status=sandbox_info.get("phases"), - state=sandbox_info.get("state"), - port_mapping=sandbox_info.get("port_mapping"), - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=sandbox_info.get("alive"), - image=sandbox_info.get("image"), - swe_rex_version=swe_version, - gateway_version=gateway_version, - user_id=sandbox_info.get("user_id"), - experiment_id=sandbox_info.get("experiment_id"), - namespace=sandbox_info.get("namespace"), - cpus=sandbox_info.get("cpus"), - memory=sandbox_info.get("memory"), - ) + deployment_info: SandboxInfo = await self._deployment_service.get_status(sandbox_id) + sandbox_info: SandboxInfo = None + if self._redis_provider: + sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id, deployment_info) + await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) + await self._update_expire_time(sandbox_id) + logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") + else: + sandbox_info = deployment_info + + return SandboxStatusResponse( + sandbox_id=sandbox_id, + status=sandbox_info.get("phases"), + state=sandbox_info.get("state"), + port_mapping=sandbox_info.get("port_mapping"), + host_name=sandbox_info.get("host_name"), + host_ip=sandbox_info.get("host_ip"), + is_alive=sandbox_info.get("alive"), + image=sandbox_info.get("image"), + swe_rex_version=swe_version, + gateway_version=gateway_version, + user_id=sandbox_info.get("user_id"), + experiment_id=sandbox_info.get("experiment_id"), + namespace=sandbox_info.get("namespace"), + cpus=sandbox_info.get("cpus"), + memory=sandbox_info.get("memory"), + ) async def _get_sandbox_info(self, sandbox_id: str) -> SandboxInfo: """Get sandbox info, prioritize Redis, fallback to Ray Actor""" diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 4b5a78164..444d9d82d 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -4,10 +4,9 @@ from rock.actions.envs.response import EnvCloseResponse, EnvListResponse, EnvMakeResponse, EnvResetResponse, EnvStepResponse from rock.actions.sandbox.response import CommandResponse, State from rock.actions.sandbox.sandbox_info import SandboxInfo -from rock.deployments.abstract import AbstractDeployment +from rock.admin.core.ray_service import RayService import ray from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig -from rock.deployments.constants import Status from rock.deployments.docker import DockerDeployment from rock.deployments.status import ServiceStatus from rock.logger import init_logger @@ -72,8 +71,9 @@ async def env_close(self, *args, **kwargs): ... class RayDeploymentService(): - def __init__(self, ray_namespace: str): + def __init__(self, ray_namespace: str, ray_service: RayService): self._ray_namespace = ray_namespace + self._ray_service = ray_service def _get_actor_name(self, sandbox_id): return f"sandbox-{sandbox_id}" @@ -87,6 +87,7 @@ async def is_deployment_alive(self, sandbox_id) -> bool: async def async_ray_get_actor(self, sandbox_id: str): """Async wrapper for ray.get_actor() using asyncio.to_thread for non-blocking execution.""" + self._ray_service.increment_ray_request_count() try: actor_name = self._get_actor_name(sandbox_id) result = await asyncio.to_thread(ray.get_actor, actor_name, namespace=self._ray_namespace) @@ -101,8 +102,8 @@ async def async_ray_get_actor(self, sandbox_id: str): async def async_ray_get(self, ray_future: ray.ObjectRef): """Async wrapper for ray.get() using asyncio.to_thread for non-blocking execution.""" + self._ray_service.increment_ray_request_count() try: - # Use asyncio.to_thread to run ray.get in a thread pool without managing executor result = await asyncio.to_thread(ray.get, ray_future, timeout=60) except Exception as e: logger.error("ray get failed", exc_info=e) @@ -111,22 +112,23 @@ async def async_ray_get(self, ray_future: ray.ObjectRef): return result async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> SandboxInfo: - sandbox_actor: SandboxActor = await self.creator_actor(config) - user_id = user_info.get("user_id", "default") - experiment_id = user_info.get("experiment_id", "default") - namespace = user_info.get("namespace", "default") - rock_authorization = user_info.get("rock_authorization", "default") - sandbox_actor.start.remote() - sandbox_actor.set_user_id.remote(user_id) - sandbox_actor.set_experiment_id.remote(experiment_id) - sandbox_actor.set_namespace.remote(namespace) - sandbox_info: SandboxInfo = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) - sandbox_info["user_id"] = user_id - sandbox_info["experiment_id"] = experiment_id - sandbox_info["namespace"] = namespace - sandbox_info["state"] = State.PENDING - sandbox_info["rock_authorization"] = rock_authorization - return sandbox_info + async with self._ray_service.get_ray_rwlock().read_lock(): + sandbox_actor: SandboxActor = await self.creator_actor(config) + user_id = user_info.get("user_id", "default") + experiment_id = user_info.get("experiment_id", "default") + namespace = user_info.get("namespace", "default") + rock_authorization = user_info.get("rock_authorization", "default") + sandbox_actor.start.remote() + sandbox_actor.set_user_id.remote(user_id) + sandbox_actor.set_experiment_id.remote(experiment_id) + sandbox_actor.set_namespace.remote(namespace) + sandbox_info: SandboxInfo = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) + sandbox_info["user_id"] = user_id + sandbox_info["experiment_id"] = experiment_id + sandbox_info["namespace"] = namespace + sandbox_info["state"] = State.PENDING + sandbox_info["rock_authorization"] = rock_authorization + return sandbox_info async def creator_actor(self, config: DockerDeploymentConfig): actor_options = self._generate_actor_options(config) @@ -147,40 +149,45 @@ def _generate_actor_options(self, config: DockerDeploymentConfig) -> dict: raise BadRequestRockError(f"Invalid memory size: {config.memory}") async def stop(self, sandbox_id: str): - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - await self.async_ray_get(actor.stop.remote()) - logger.info(f"run time stop over {sandbox_id}") - ray.kill(actor) + async with self._ray_service.get_ray_rwlock().read_lock(): + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + await self.async_ray_get(actor.stop.remote()) + logger.info(f"run time stop over {sandbox_id}") + ray.kill(actor) async def get_status(self, sandbox_id: str) -> SandboxInfo: - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) - remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) - sandbox_info["phases"] = remote_status.phases - sandbox_info["port_mapping"] = remote_status.get_port_mapping() - alive = await self.async_ray_get(actor.is_alive.remote()) - sandbox_info["alive"] = alive.is_alive - if alive.is_alive: - sandbox_info["state"] = State.RUNNING - return sandbox_info + async with self._ray_service.get_ray_rwlock().read_lock(): + actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) + remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) + sandbox_info["phases"] = remote_status.phases + sandbox_info["port_mapping"] = remote_status.get_port_mapping() + alive = await self.async_ray_get(actor.is_alive.remote()) + sandbox_info["alive"] = alive.is_alive + if alive.is_alive: + sandbox_info["state"] = State.RUNNING + return sandbox_info async def get_mount(self, sandbox_id: str): - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.get_mount.remote()) - logger.info(f"get_mount: {result}") - return result + with self._ray_service.get_ray_rwlock().read_lock(): + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.get_mount.remote()) + logger.info(f"get_mount: {result}") + return result async def get_sandbox_statistics(self, sandbox_id: str): - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.get_sandbox_statistics.remote()) - logger.info(f"get_sandbox_statistics: {result}") - return result + async with self._ray_service.get_ray_rwlock().read_lock(): + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.get_sandbox_statistics.remote()) + logger.info(f"get_sandbox_statistics: {result}") + return result async def commit(self, sandbox_id) -> CommandResponse: - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.commit.remote()) - logger.info(f"commit: {result}") - return result + with self._ray_service.get_ray_rwlock().read_lock(): + actor = await self.async_ray_get_actor(sandbox_id) + result = await self.async_ray_get(actor.commit.remote()) + logger.info(f"commit: {result}") + return result async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: sandbox_id = request.sandbox_id diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index bae79fa95..3529fa7e1 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -55,8 +55,8 @@ async def sandbox_manager(rock_config: RockConfig, redis_provider: RedisProvider return sandbox_manager @pytest.fixture -async def ray_deployment_service(rock_config: RockConfig, ray_init_shutdown): - ray_deployment_service = RayDeploymentService(ray_namespace=rock_config.ray.namespace) +async def ray_deployment_service(rock_config: RockConfig, ray_init_shutdown, ray_service): + ray_deployment_service = RayDeploymentService(ray_namespace=rock_config.ray.namespace, ray_service=ray_service) return ray_deployment_service @pytest.fixture diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index f2f8ed4fe..749779984 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -24,7 +24,7 @@ async def test_async_sandbox_start(sandbox_manager: SandboxManager): response = await sandbox_manager.submit(DockerDeploymentConfig()) sandbox_id = response.sandbox_id assert sandbox_id is not None - assert wait_sandbox_instance_alive(sandbox_manager, sandbox_id) + assert await wait_sandbox_instance_alive(sandbox_manager, sandbox_id) assert await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) @@ -65,7 +65,7 @@ async def test_ray_actor_is_alive(sandbox_manager): response = await sandbox_manager.submit(docker_deploy_config) assert response.sandbox_id is not None - assert wait_sandbox_instance_alive(sandbox_manager, response.sandbox_id) + assert await wait_sandbox_instance_alive(sandbox_manager, response.sandbox_id) sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(response.sandbox_id) ray.kill(sandbox_actor) @@ -80,7 +80,7 @@ async def test_user_info_set_success(sandbox_manager): response = await sandbox_manager.submit(RayDeploymentConfig(), user_info=user_info) sandbox_id = response.sandbox_id - assert wait_sandbox_instance_alive(sandbox_manager, sandbox_id) + assert await wait_sandbox_instance_alive(sandbox_manager, sandbox_id) is_alive_response = await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) assert is_alive_response From 5ac5088f34281d8d665cf4e760f8454ff44431e6 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Fri, 23 Jan 2026 06:10:38 +0000 Subject: [PATCH 11/25] remove ray dependency in base manager --- rock/actions/sandbox/response.py | 40 +++++++++++++++++++ rock/sandbox/base_manager.py | 27 +++++-------- rock/sandbox/sandbox_manager.py | 4 ++ rock/sandbox/service/deployment_service.py | 26 +++++++++++- .../service/test_deployment_service.py | 20 +++++++++- tests/unit/sandbox/test_sandbox_manager.py | 13 +++--- 6 files changed, 106 insertions(+), 24 deletions(-) diff --git a/rock/actions/sandbox/response.py b/rock/actions/sandbox/response.py index 54e405f78..6df085993 100644 --- a/rock/actions/sandbox/response.py +++ b/rock/actions/sandbox/response.py @@ -126,3 +126,43 @@ class ChownResponse(BaseModel): class ChmodResponse(BaseModel): success: bool = False message: str = "" + + +class SystemResourceMetrics(BaseModel): + """System resource metrics""" + + total_cpu: float = 0.0 + """Total CPU cores""" + + total_memory: float = 0.0 + """Total memory in GB""" + + available_cpu: float = 0.0 + """Available CPU cores""" + + available_memory: float = 0.0 + """Available memory in GB""" + + gpu_count: int = 0 + """Total GPU count""" + + available_gpu: int = 0 + """Available GPU count""" + + def get_cpu_utilization(self) -> float: + """Get CPU utilization rate (0.0 - 1.0)""" + if self.total_cpu == 0: + return 0.0 + return (self.total_cpu - self.available_cpu) / self.total_cpu + + def get_memory_utilization(self) -> float: + """Get memory utilization rate (0.0 - 1.0)""" + if self.total_memory == 0: + return 0.0 + return (self.total_memory - self.available_memory) / self.total_memory + + def get_gpu_utilization(self) -> float: + """Get GPU utilization rate (0.0 - 1.0)""" + if self.gpu_count == 0: + return 0.0 + return (self.gpu_count - self.available_gpu) / self.gpu_count diff --git a/rock/sandbox/base_manager.py b/rock/sandbox/base_manager.py index 497335719..9dd80dbd3 100644 --- a/rock/sandbox/base_manager.py +++ b/rock/sandbox/base_manager.py @@ -109,24 +109,17 @@ async def _collect_and_report_metrics_internal(self): logger.debug(f"Metrics overall report rt:{overall_duration:.4f}s") async def _report_system_resource_metrics(self): - """汇报系统资源指标""" - total_cpu, total_mem, available_cpu, available_mem = await self._collect_system_resource_metrics() - self.metrics_monitor.record_gauge_by_name(MetricsConstants.TOTAL_CPU_RESOURCE, total_cpu) - self.metrics_monitor.record_gauge_by_name(MetricsConstants.TOTAL_MEM_RESOURCE, total_mem) - self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_CPU_RESOURCE, available_cpu) - self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_MEM_RESOURCE, available_mem) - - # TODO: remove ray dependency in base manager, impl it in ray deployment service - async def _collect_system_resource_metrics(self): - """收集系统资源指标""" - cluster_resources = ray.cluster_resources() - available_resources = ray.available_resources() - total_cpu = cluster_resources.get("CPU", 0) - total_mem = cluster_resources.get("memory", 0) / 1024**3 - available_cpu = available_resources.get("CPU", 0) - available_mem = available_resources.get("memory", 0) / 1024**3 - return total_cpu, total_mem, available_cpu, available_mem + """Report system resource metrics""" + metrics = await self._collect_system_resource_metrics() + self.metrics_monitor.record_gauge_by_name(MetricsConstants.TOTAL_CPU_RESOURCE, metrics.total_cpu) + self.metrics_monitor.record_gauge_by_name(MetricsConstants.TOTAL_MEM_RESOURCE, metrics.total_memory) + self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_CPU_RESOURCE, metrics.available_cpu) + self.metrics_monitor.record_gauge_by_name(MetricsConstants.AVAILABLE_MEM_RESOURCE, metrics.available_memory) + async def _collect_system_resource_metrics(self): + """Collect system resource metrics""" + raise NotImplementedError("This method should be implemented by subclasses") + async def _collect_sandbox_meta(self) -> tuple[int, dict[str, dict[str, str]]]: meta: dict = {} cnt = 0 diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index c923e05a8..777f805a9 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -383,3 +383,7 @@ def validate_sandbox_spec(self, runtime_config: RuntimeConfig, deployment_config except ValueError as e: logger.warning(f"Invalid memory size: {deployment_config.memory}", exc_info=e) raise BadRequestRockError(f"Invalid memory size: {deployment_config.memory}") + + async def _collect_system_resource_metrics(self): + """Collect system resource metrics, delegate to deployment service""" + return await self._deployment_service.collect_system_resource_metrics() diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 444d9d82d..0c48e59e1 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -2,7 +2,7 @@ import asyncio from rock.actions.envs.request import EnvCloseRequest, EnvMakeRequest, EnvResetRequest, EnvStepRequest from rock.actions.envs.response import EnvCloseResponse, EnvListResponse, EnvMakeResponse, EnvResetResponse, EnvStepResponse -from rock.actions.sandbox.response import CommandResponse, State +from rock.actions.sandbox.response import CommandResponse, State, SystemResourceMetrics from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService import ray @@ -70,6 +70,10 @@ async def env_list(self, *args, **kwargs): async def env_close(self, *args, **kwargs): ... + @abstractmethod + async def collect_system_resource_metrics(self) -> SystemResourceMetrics: + ... + class RayDeploymentService(): def __init__(self, ray_namespace: str, ray_service: RayService): self._ray_namespace = ray_namespace @@ -222,3 +226,23 @@ async def env_list(self, sandbox_id) -> EnvListResponse: result = await self.async_ray_get(actor.env_list.remote()) logger.info(f"env_list: {result}") return result + + async def collect_system_resource_metrics(self) -> SystemResourceMetrics: + """Collect system resource metrics""" + cluster_resources = ray.cluster_resources() + available_resources = ray.available_resources() + total_cpu = cluster_resources.get("CPU", 0) + total_mem = cluster_resources.get("memory", 0) / 1024**3 + available_cpu = available_resources.get("CPU", 0) + available_mem = available_resources.get("memory", 0) / 1024**3 + gpu_count = cluster_resources.get("GPU", 0) + available_gpu = available_resources.get("GPU", 0) + + return SystemResourceMetrics( + total_cpu=total_cpu, + total_memory=total_mem, + available_cpu=available_cpu, + available_memory=available_mem, + gpu_count=int(gpu_count), + available_gpu=int(available_gpu), + ) diff --git a/tests/unit/sandbox/service/test_deployment_service.py b/tests/unit/sandbox/service/test_deployment_service.py index 1217e794c..77b75fe14 100644 --- a/tests/unit/sandbox/service/test_deployment_service.py +++ b/tests/unit/sandbox/service/test_deployment_service.py @@ -1,4 +1,5 @@ import pytest +from rock.actions.sandbox.response import SystemResourceMetrics @pytest.mark.need_ray @@ -7,4 +8,21 @@ async def test_get_actor_not_exist_raises_value_error(ray_deployment_service): sandbox_id = "unknown" with pytest.raises(Exception) as exc_info: await ray_deployment_service.async_ray_get_actor(sandbox_id) - assert exc_info.type == ValueError \ No newline at end of file + assert exc_info.type == ValueError + + +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_collect_system_resource_metrics(ray_deployment_service): + metrics: SystemResourceMetrics = await ray_deployment_service.collect_system_resource_metrics() + assert metrics.total_cpu > 0 + assert metrics.total_memory > 0 + assert metrics.available_cpu >= 0 + assert metrics.available_memory >= 0 + assert metrics.available_cpu <= metrics.total_cpu + assert metrics.available_memory <= metrics.total_memory + # 测试利用率计算 + cpu_utilization = metrics.get_cpu_utilization() + assert 0.0 <= cpu_utilization <= 1.0 + memory_utilization = metrics.get_memory_utilization() + assert 0.0 <= memory_utilization <= 1.0 \ No newline at end of file diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 749779984..df0821d69 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -119,11 +119,14 @@ async def test_resource_limit_exception_memory(sandbox_manager, docker_deploymen @pytest.mark.need_ray @pytest.mark.asyncio async def test_get_system_resource_info(sandbox_manager): - total_cpu, total_mem, ava_cpu, ava_mem = await sandbox_manager._collect_system_resource_metrics() - assert total_cpu > 0 - assert total_mem > 0 - assert ava_cpu > 0 - assert ava_mem > 0 + from rock.actions.sandbox.response import SystemResourceMetrics + metrics: SystemResourceMetrics = await sandbox_manager._collect_system_resource_metrics() + assert metrics.total_cpu > 0 + assert metrics.total_memory > 0 + assert metrics.available_cpu >= 0 + assert metrics.available_memory >= 0 + assert metrics.available_cpu <= metrics.total_cpu + assert metrics.available_memory <= metrics.total_memory @pytest.mark.need_ray From 38c971e56a70ec27d4cc631210e34660726178c2 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Fri, 23 Jan 2026 06:56:44 +0000 Subject: [PATCH 12/25] fix comment --- rock/sandbox/service/deployment_service.py | 12 ++++++------ rock/sandbox/service/sandbox_proxy_service.py | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 0c48e59e1..bf6d41a07 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -19,7 +19,7 @@ class AbstractDeploymentService(): @abstractmethod - async def is_deployment_alive(self, sandbox_id) -> bool: + async def is_deployment_alive(self, sandbox_id: str) -> bool: ... @abstractmethod @@ -28,26 +28,26 @@ async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxInfo ... @abstractmethod - async def get_status(self, *args, **kwargs) -> SandboxInfo: + async def get_status(self, sandbox_id: str) -> SandboxInfo: """Get status of sandbox.""" ... @abstractmethod - async def stop(self, *args, **kwargs): + async def stop(self, sandbox_id: str): """Stop sandbox.""" @abstractmethod - async def get_mount(self, *args, **kwargs): + async def get_mount(self, sandbox_id: str): """Get mount of sandbox.""" ... @abstractmethod - async def get_sandbox_statistics(self, *args, **kwargs): + async def get_sandbox_statistics(self, sandbox_id: str): """Get sandbox statistics.""" ... @abstractmethod - async def commit(self, *args, **kwargs) -> CommandResponse: + async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: ... @abstractmethod diff --git a/rock/sandbox/service/sandbox_proxy_service.py b/rock/sandbox/service/sandbox_proxy_service.py index 07972ea23..8b896bf0f 100644 --- a/rock/sandbox/service/sandbox_proxy_service.py +++ b/rock/sandbox/service/sandbox_proxy_service.py @@ -88,7 +88,6 @@ async def create_session(self, request: CreateSessionRequest) -> CreateBashSessi sandbox_id = request.sandbox_id await self._update_expire_time(sandbox_id) sandbox_status_dicts = await self.get_service_status(sandbox_id) - print(f"sandbox status dicts: {sandbox_status_dicts}") response = await self._send_request( sandbox_id, sandbox_status_dicts[0], "create_session", None, request.model_dump(), None, "POST" ) From 3469bacfac2a7c8275a70ecb05bbbd1c90547c4a Mon Sep 17 00:00:00 2001 From: daifangwen Date: Fri, 23 Jan 2026 07:42:55 +0000 Subject: [PATCH 13/25] split gem api from deployment service and sink ray operations to ray service --- rock/admin/core/ray_service.py | 26 ++++ rock/sandbox/gem_manager.py | 15 ++- rock/sandbox/service/deployment_service.py | 114 +++--------------- rock/sandbox/service/env_service.py | 74 ++++++++++++ .../service/test_deployment_service.py | 2 +- tests/unit/sandbox/test_sandbox_manager.py | 2 +- 6 files changed, 127 insertions(+), 106 deletions(-) create mode 100644 rock/sandbox/service/env_service.py diff --git a/rock/admin/core/ray_service.py b/rock/admin/core/ray_service.py index 2dee05a36..dfe76f25c 100644 --- a/rock/admin/core/ray_service.py +++ b/rock/admin/core/ray_service.py @@ -1,3 +1,4 @@ +import asyncio import ray import time @@ -34,6 +35,31 @@ def increment_ray_request_count(self): def get_ray_rwlock(self): return self._ray_rwlock + + async def async_ray_get_actor(self, sandbox_id: str): + """Async wrapper for ray.get_actor() using asyncio.to_thread for non-blocking execution.""" + self.increment_ray_request_count() + try: + result = await asyncio.to_thread(ray.get_actor, sandbox_id, namespace=self._config.namespace) + except ValueError as e: + logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) + raise e + except Exception as e: + logger.error("ray get actor failed", exc_info=e) + error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get actor failed, {str(e)}" + raise Exception(error_msg) + return result + + async def async_ray_get(self, ray_future: ray.ObjectRef): + """Async wrapper for ray.get() using asyncio.to_thread for non-blocking execution.""" + self.increment_ray_request_count() + try: + result = await asyncio.to_thread(ray.get, ray_future, timeout=60) + except Exception as e: + logger.error("ray get failed", exc_info=e) + error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get failed, {str(e)}" + raise Exception(error_msg) + return result def _setup_ray_reconnect_scheduler(self): diff --git a/rock/sandbox/gem_manager.py b/rock/sandbox/gem_manager.py index 6c20b66d1..e060be78e 100644 --- a/rock/sandbox/gem_manager.py +++ b/rock/sandbox/gem_manager.py @@ -17,11 +17,13 @@ from rock.config import RockConfig from rock.deployments.config import DockerDeploymentConfig from rock.sandbox.sandbox_manager import SandboxManager +from rock.sandbox.service.env_service import RayEnvService from rock.utils.providers import RedisProvider from rock.admin.core.ray_service import RayService class GemManager(SandboxManager): + _env_service: RayEnvService def __init__( self, rock_config: RockConfig, @@ -31,6 +33,7 @@ def __init__( enable_runtime_auto_clear: bool = False, ): super().__init__(rock_config, redis_provider, ray_namespace, ray_service, enable_runtime_auto_clear) + self._env_service = RayEnvService(ray_namespace=ray_namespace, ray_service=ray_service) async def env_make(self, env_id: str) -> EnvMakeResponse: config = DockerDeploymentConfig(image=env_vars.ROCK_ENVHUB_DEFAULT_DOCKER_IMAGE) @@ -52,22 +55,22 @@ async def wait_until_alive(sandbox_id: str, interval: float = 1.0): except asyncio.TimeoutError: raise Exception("Sandbox startup timeout after 300s") - make_response = await self._deployment_service.env_make( + make_response = await self._env_service.env_make( EnvMakeRequest( env_id=env_id, sandbox_id=sandbox_start_response.sandbox_id, ) ) return make_response - + async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: - return await self._deployment_service.env_step(request) + return await self._env_service.env_step(request) async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: - return await self._deployment_service.env_reset(request) + return await self._env_service.env_reset(request) async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: - return await self._deployment_service.env_close(request) + return await self._env_service.env_close(request) async def env_list(self, sandbox_id: str) -> EnvListResponse: - return await self._deployment_service.env_list(sandbox_id) + return await self._env_service.env_list(sandbox_id) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index bf6d41a07..3fc083d5c 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -1,7 +1,5 @@ from abc import abstractmethod -import asyncio -from rock.actions.envs.request import EnvCloseRequest, EnvMakeRequest, EnvResetRequest, EnvStepRequest -from rock.actions.envs.response import EnvCloseResponse, EnvListResponse, EnvMakeResponse, EnvResetResponse, EnvStepResponse + from rock.actions.sandbox.response import CommandResponse, State, SystemResourceMetrics from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService @@ -50,26 +48,6 @@ async def get_sandbox_statistics(self, sandbox_id: str): async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: ... - @abstractmethod - async def env_step(self, *args, **kwargs): - ... - - @abstractmethod - async def env_make(self, *args, **kwargs): - ... - - @abstractmethod - async def env_reset(self, *args, **kwargs): - ... - - @abstractmethod - async def env_list(self, *args, **kwargs): - ... - - @abstractmethod - async def env_close(self, *args, **kwargs): - ... - @abstractmethod async def collect_system_resource_metrics(self) -> SystemResourceMetrics: ... @@ -84,36 +62,10 @@ def _get_actor_name(self, sandbox_id): async def is_deployment_alive(self, sandbox_id) -> bool: try: - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) except ValueError: return False - return await self.async_ray_get(actor.is_alive.remote()) - - async def async_ray_get_actor(self, sandbox_id: str): - """Async wrapper for ray.get_actor() using asyncio.to_thread for non-blocking execution.""" - self._ray_service.increment_ray_request_count() - try: - actor_name = self._get_actor_name(sandbox_id) - result = await asyncio.to_thread(ray.get_actor, actor_name, namespace=self._ray_namespace) - except ValueError as e: - logger.error(f"ray get actor, actor {sandbox_id} not exist", exc_info=e) - raise e - except Exception as e: - logger.error("ray get actor failed", exc_info=e) - error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get actor failed, {str(e)}" - raise Exception(error_msg) - return result - - async def async_ray_get(self, ray_future: ray.ObjectRef): - """Async wrapper for ray.get() using asyncio.to_thread for non-blocking execution.""" - self._ray_service.increment_ray_request_count() - try: - result = await asyncio.to_thread(ray.get, ray_future, timeout=60) - except Exception as e: - logger.error("ray get failed", exc_info=e) - error_msg = str(e.args[0]) if len(e.args) > 0 else f"ray get failed, {str(e)}" - raise Exception(error_msg) - return result + return await self._ray_service.async_ray_get(actor.is_alive.remote()) async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> SandboxInfo: async with self._ray_service.get_ray_rwlock().read_lock(): @@ -126,7 +78,7 @@ async def submit(self, config: DockerDeploymentConfig, user_info: dict) -> Sandb sandbox_actor.set_user_id.remote(user_id) sandbox_actor.set_experiment_id.remote(experiment_id) sandbox_actor.set_namespace.remote(namespace) - sandbox_info: SandboxInfo = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) + sandbox_info: SandboxInfo = await self._ray_service.async_ray_get(sandbox_actor.sandbox_info.remote()) sandbox_info["user_id"] = user_id sandbox_info["experiment_id"] = experiment_id sandbox_info["namespace"] = namespace @@ -154,19 +106,19 @@ def _generate_actor_options(self, config: DockerDeploymentConfig) -> dict: async def stop(self, sandbox_id: str): async with self._ray_service.get_ray_rwlock().read_lock(): - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - await self.async_ray_get(actor.stop.remote()) + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + await self._ray_service.async_ray_get(actor.stop.remote()) logger.info(f"run time stop over {sandbox_id}") ray.kill(actor) async def get_status(self, sandbox_id: str) -> SandboxInfo: async with self._ray_service.get_ray_rwlock().read_lock(): - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - sandbox_info: SandboxInfo = await self.async_ray_get(actor.sandbox_info.remote()) - remote_status: ServiceStatus = await self.async_ray_get(actor.get_status.remote()) + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + sandbox_info: SandboxInfo = await self._ray_service.async_ray_get(actor.sandbox_info.remote()) + remote_status: ServiceStatus = await self._ray_service.async_ray_get(actor.get_status.remote()) sandbox_info["phases"] = remote_status.phases sandbox_info["port_mapping"] = remote_status.get_port_mapping() - alive = await self.async_ray_get(actor.is_alive.remote()) + alive = await self._ray_service.async_ray_get(actor.is_alive.remote()) sandbox_info["alive"] = alive.is_alive if alive.is_alive: sandbox_info["state"] = State.RUNNING @@ -174,58 +126,24 @@ async def get_status(self, sandbox_id: str) -> SandboxInfo: async def get_mount(self, sandbox_id: str): with self._ray_service.get_ray_rwlock().read_lock(): - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.get_mount.remote()) + actor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.get_mount.remote()) logger.info(f"get_mount: {result}") return result async def get_sandbox_statistics(self, sandbox_id: str): async with self._ray_service.get_ray_rwlock().read_lock(): - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.get_sandbox_statistics.remote()) + actor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.get_sandbox_statistics.remote()) logger.info(f"get_sandbox_statistics: {result}") return result async def commit(self, sandbox_id) -> CommandResponse: with self._ray_service.get_ray_rwlock().read_lock(): - actor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.commit.remote()) + actor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.commit.remote()) logger.info(f"commit: {result}") return result - - async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: - sandbox_id = request.sandbox_id - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.env_step.remote(request)) - logger.info(f"env_step: {result}") - return result - - async def env_make(self, request: EnvMakeRequest) -> EnvMakeResponse: - sandbox_id = request.sandbox_id - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.env_make.remote(request)) - logger.info(f"env_make: {result}") - return result - - async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: - sandbox_id = request.sandbox_id - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.env_reset.remote(request)) - logger.info(f"env_reset: {result}") - return result - - async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: - sandbox_id = request.sandbox_id - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.env_close.remote(request)) - logger.info(f"env_close: {result}") - return result - - async def env_list(self, sandbox_id) -> EnvListResponse: - actor: SandboxActor = await self.async_ray_get_actor(sandbox_id) - result = await self.async_ray_get(actor.env_list.remote()) - logger.info(f"env_list: {result}") - return result async def collect_system_resource_metrics(self) -> SystemResourceMetrics: """Collect system resource metrics""" diff --git a/rock/sandbox/service/env_service.py b/rock/sandbox/service/env_service.py new file mode 100644 index 000000000..e87536b09 --- /dev/null +++ b/rock/sandbox/service/env_service.py @@ -0,0 +1,74 @@ +from abc import ABC, abstractmethod + +from rock.actions.envs.request import EnvCloseRequest, EnvMakeRequest, EnvResetRequest, EnvStepRequest +from rock.actions.envs.response import EnvCloseResponse, EnvListResponse, EnvMakeResponse, EnvResetResponse, EnvStepResponse +from rock.admin.core.ray_service import RayService +from rock.logger import init_logger +from rock.sandbox.sandbox_actor import SandboxActor + +logger = init_logger(__name__) + + +class AbstractEnvService(ABC): + @abstractmethod + async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: + ... + + @abstractmethod + async def env_make(self, request: EnvMakeRequest) -> EnvMakeResponse: + ... + + @abstractmethod + async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: + ... + + @abstractmethod + async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: + ... + + @abstractmethod + async def env_list(self, sandbox_id) -> EnvListResponse: + ... + + +class RayEnvService(AbstractEnvService): + def __init__(self, ray_namespace: str, ray_service: RayService): + self._ray_namespace = ray_namespace + self._ray_service = ray_service + + def _get_actor_name(self, sandbox_id): + return sandbox_id + + async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.env_step.remote(request)) + logger.info(f"env_step: {result}") + return result + + async def env_make(self, request: EnvMakeRequest) -> EnvMakeResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.env_make.remote(request)) + logger.info(f"env_make: {result}") + return result + + async def env_reset(self, request: EnvResetRequest) -> EnvResetResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.env_reset.remote(request)) + logger.info(f"env_reset: {result}") + return result + + async def env_close(self, request: EnvCloseRequest) -> EnvCloseResponse: + sandbox_id = request.sandbox_id + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.env_close.remote(request)) + logger.info(f"env_close: {result}") + return result + + async def env_list(self, sandbox_id) -> EnvListResponse: + actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) + result = await self._ray_service.async_ray_get(actor.env_list.remote()) + logger.info(f"env_list: {result}") + return result \ No newline at end of file diff --git a/tests/unit/sandbox/service/test_deployment_service.py b/tests/unit/sandbox/service/test_deployment_service.py index 77b75fe14..690e9000d 100644 --- a/tests/unit/sandbox/service/test_deployment_service.py +++ b/tests/unit/sandbox/service/test_deployment_service.py @@ -7,7 +7,7 @@ async def test_get_actor_not_exist_raises_value_error(ray_deployment_service): sandbox_id = "unknown" with pytest.raises(Exception) as exc_info: - await ray_deployment_service.async_ray_get_actor(sandbox_id) + await ray_deployment_service._ray_service.async_ray_get_actor(sandbox_id) assert exc_info.type == ValueError diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index df0821d69..0c6e1ad8c 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -67,7 +67,7 @@ async def test_ray_actor_is_alive(sandbox_manager): assert await wait_sandbox_instance_alive(sandbox_manager, response.sandbox_id) - sandbox_actor = await sandbox_manager._deployment_service.async_ray_get_actor(response.sandbox_id) + sandbox_actor = await sandbox_manager._deployment_service._ray_service.async_ray_get_actor(response.sandbox_id) ray.kill(sandbox_actor) assert not await sandbox_manager._deployment_service.is_deployment_alive(response.sandbox_id) From bd9c2ad425a6458d115b89d0f6a3f76f0f18f806 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 08:03:39 +0000 Subject: [PATCH 14/25] fix comment --- rock/actions/sandbox/sandbox_info.py | 1 - rock/sandbox/sandbox_manager.py | 2 +- rock/sandbox/service/deployment_service.py | 5 ++--- tests/unit/sandbox/test_sandbox_manager.py | 8 ++++---- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/rock/actions/sandbox/sandbox_info.py b/rock/actions/sandbox/sandbox_info.py index dd0c96928..63bcdd9c0 100644 --- a/rock/actions/sandbox/sandbox_info.py +++ b/rock/actions/sandbox/sandbox_info.py @@ -20,7 +20,6 @@ class SandboxInfo(TypedDict, total=False): create_user_gray_flag: bool cpus: float memory: str - alive: bool class SandboxListItem(SandboxInfo): diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 777f805a9..37cbf4d47 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -151,7 +151,7 @@ async def get_status(self, sandbox_id) -> SandboxStatusResponse: port_mapping=sandbox_info.get("port_mapping"), host_name=sandbox_info.get("host_name"), host_ip=sandbox_info.get("host_ip"), - is_alive=sandbox_info.get("alive"), + is_alive=sandbox_info.get("state") == State.RUNNING, image=sandbox_info.get("image"), swe_rex_version=swe_version, gateway_version=gateway_version, diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 3fc083d5c..1fbcf1439 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -17,7 +17,7 @@ class AbstractDeploymentService(): @abstractmethod - async def is_deployment_alive(self, sandbox_id: str) -> bool: + async def is_alive(self, sandbox_id: str) -> bool: ... @abstractmethod @@ -60,7 +60,7 @@ def __init__(self, ray_namespace: str, ray_service: RayService): def _get_actor_name(self, sandbox_id): return f"sandbox-{sandbox_id}" - async def is_deployment_alive(self, sandbox_id) -> bool: + async def is_alive(self, sandbox_id) -> bool: try: actor: SandboxActor = await self._ray_service.async_ray_get_actor(self._get_actor_name(sandbox_id)) except ValueError: @@ -119,7 +119,6 @@ async def get_status(self, sandbox_id: str) -> SandboxInfo: sandbox_info["phases"] = remote_status.phases sandbox_info["port_mapping"] = remote_status.get_port_mapping() alive = await self._ray_service.async_ray_get(actor.is_alive.remote()) - sandbox_info["alive"] = alive.is_alive if alive.is_alive: sandbox_info["state"] = State.RUNNING return sandbox_info diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 0c6e1ad8c..a444eb0be 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -26,7 +26,7 @@ async def test_async_sandbox_start(sandbox_manager: SandboxManager): assert sandbox_id is not None assert await wait_sandbox_instance_alive(sandbox_manager, sandbox_id) - assert await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) + assert await sandbox_manager._deployment_service.is_alive(sandbox_id) sandbox_status = await sandbox_manager.get_status(sandbox_id) assert sandbox_status.user_id == "default" @@ -70,7 +70,7 @@ async def test_ray_actor_is_alive(sandbox_manager): sandbox_actor = await sandbox_manager._deployment_service._ray_service.async_ray_get_actor(response.sandbox_id) ray.kill(sandbox_actor) - assert not await sandbox_manager._deployment_service.is_deployment_alive(response.sandbox_id) + assert not await sandbox_manager._deployment_service.is_alive(response.sandbox_id) @pytest.mark.need_ray @@ -82,7 +82,7 @@ async def test_user_info_set_success(sandbox_manager): assert await wait_sandbox_instance_alive(sandbox_manager, sandbox_id) - is_alive_response = await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) + is_alive_response = await sandbox_manager._deployment_service.is_alive(sandbox_id) assert is_alive_response sandbox_status = await sandbox_manager.get_status(sandbox_id) @@ -163,7 +163,7 @@ async def test_sandbox_start_with_sandbox_id(sandbox_manager): async def wait_sandbox_instance_alive(sandbox_manager: SandboxManager, sandbox_id: str) -> bool: cnt = 0 while True: - is_alive_response = await sandbox_manager._deployment_service.is_deployment_alive(sandbox_id) + is_alive_response = await sandbox_manager._deployment_service.is_alive(sandbox_id) if is_alive_response: return True time.sleep(1) From e126c133bcb2ac58af6dfb4b722c716878cb4ac3 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 11:13:26 +0000 Subject: [PATCH 15/25] refactor: consolidate status methods by merging get_status_v2 into get_status --- rock/sandbox/sandbox_manager.py | 126 +++++++++++++++++--------------- 1 file changed, 68 insertions(+), 58 deletions(-) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 37cbf4d47..0e6c6b991 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -133,25 +133,72 @@ async def _clear_redis_keys(self, sandbox_id): logger.info(f"sandbox {sandbox_id} deleted from redis") @monitor_sandbox_operation() - async def get_status(self, sandbox_id) -> SandboxStatusResponse: - deployment_info: SandboxInfo = await self._deployment_service.get_status(sandbox_id) - sandbox_info: SandboxInfo = None + async def get_status(self, sandbox_id, use_proxy: bool = False) -> SandboxStatusResponse: + """ + Get sandbox status with optional remote health check. + + Note: The use_proxy parameter is deprecated and will be removed in a future version. + + Args: + sandbox_id: The sandbox identifier + use_proxy: If True, performs remote status check and alive verification (default: False) + + Returns: + SandboxStatusResponse with complete status information + """ + # 1. Get sandbox_info (unified exception handling)ß + sandbox_info = await self._get_sandbox_info(sandbox_id) + host_ip = sandbox_info.get("host_ip") + + # 2. Determine status retrieval strategy + if use_proxy and host_ip: + # Use remote status check with parallel operations + _, remote_status = await asyncio.gather( + self._update_expire_time(sandbox_id), + self.get_remote_status(sandbox_id, host_ip), + ) + + # Update sandbox_info with remote status + sandbox_info.update(remote_status.to_dict()) + + # Check alive status + is_alive = await self._check_alive_status(sandbox_id, host_ip, remote_status) + if is_alive: + sandbox_info["state"] = State.RUNNING + + status = remote_status.phases + port_mapping = remote_status.get_port_mapping() + else: + # Fallback to deployment service status + deployment_info = await self._deployment_service.get_status(sandbox_id) + + # Merge deployment info into sandbox_info + if self._redis_provider: + sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id, deployment_info) + else: + sandbox_info.update(deployment_info) + + # Update expire time + await self._update_expire_time(sandbox_id) + + status = sandbox_info.get("phases") + port_mapping = sandbox_info.get("port_mapping") + is_alive = sandbox_info.get("state") == State.RUNNING + + # 3. Persist to Redis if available if self._redis_provider: - sandbox_info = await self.build_sandbox_info_from_redis(sandbox_id, deployment_info) await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) - await self._update_expire_time(sandbox_id) - logger.info(f"sandbox {sandbox_id} status is {sandbox_info}, write to redis") - else: - sandbox_info = deployment_info - + logger.info(f"sandbox {sandbox_id} status updated, write to redis") + + # 4. Build and return unified response return SandboxStatusResponse( sandbox_id=sandbox_id, - status=sandbox_info.get("phases"), + status=status, + port_mapping=port_mapping, state=sandbox_info.get("state"), - port_mapping=sandbox_info.get("port_mapping"), host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=sandbox_info.get("state") == State.RUNNING, + host_ip=host_ip, + is_alive=is_alive, image=sandbox_info.get("image"), swe_rex_version=swe_version, gateway_version=gateway_version, @@ -163,14 +210,12 @@ async def get_status(self, sandbox_id) -> SandboxStatusResponse: ) async def _get_sandbox_info(self, sandbox_id: str) -> SandboxInfo: - """Get sandbox info, prioritize Redis, fallback to Ray Actor""" + """Get sandbox info, prioritize Redis, fallback to deployment service""" if self._redis_provider: sandbox_info = await build_sandbox_from_redis(self._redis_provider, sandbox_id) else: - sandbox_actor = await self.async_ray_get_actor(sandbox_id) - if sandbox_actor is None: - raise Exception(f"sandbox {sandbox_id} not found to get status") - sandbox_info = await self.async_ray_get(sandbox_actor.sandbox_info.remote()) + # Fallback to deployment service (Ray calls are encapsulated in deployment_service) + sandbox_info = await self._deployment_service.get_status(sandbox_id) if sandbox_info is None: raise Exception(f"sandbox {sandbox_id} not found to get status") @@ -193,47 +238,12 @@ async def _check_alive_status( except Exception: return False - @monitor_sandbox_operation() async def get_status_v2(self, sandbox_id) -> SandboxStatusResponse: - # 1. Get sandbox_info (unified exception handling) - sandbox_info = await self._get_sandbox_info(sandbox_id) - - # 2. Parallel execution: update expire time & get remote status - host_ip = sandbox_info.get("host_ip") - _, remote_status = await asyncio.gather( - self._update_expire_time(sandbox_id), - self.get_remote_status(sandbox_id, host_ip), - ) - - # 3. Update sandbox_info and check alive status - sandbox_info.update(remote_status.to_dict()) - is_alive = await self._check_alive_status(sandbox_id, host_ip, remote_status) - if is_alive: - sandbox_info["state"] = State.RUNNING - - # 4. Persist to Redis if Redis exists - if self._redis_provider: - await self._redis_provider.json_set(alive_sandbox_key(sandbox_id), "$", sandbox_info) - logger.info(f"sandbox {sandbox_id} status is {remote_status}, write to redis") - - # 5. Build and return response - return SandboxStatusResponse( - sandbox_id=sandbox_id, - status=remote_status.phases, - port_mapping=remote_status.get_port_mapping(), - state=sandbox_info.get("state"), - host_name=sandbox_info.get("host_name"), - host_ip=sandbox_info.get("host_ip"), - is_alive=is_alive, - image=sandbox_info.get("image"), - swe_rex_version=swe_version, - gateway_version=gateway_version, - user_id=sandbox_info.get("user_id"), - experiment_id=sandbox_info.get("experiment_id"), - namespace=sandbox_info.get("namespace"), - cpus=sandbox_info.get("cpus"), - memory=sandbox_info.get("memory"), - ) + """ + Deprecated: Use get_status(sandbox_id, use_proxy=True) instead. + This method is kept for backward compatibility. + """ + return await self.get_status(sandbox_id, use_proxy=True) async def get_remote_status(self, sandbox_id: str, host_ip: str) -> ServiceStatus: service_status_path = PersistedServiceStatus.gen_service_status_path(sandbox_id) From 29e31ec86af05f3666cdde2918dd0de8feaf39d1 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 12:28:37 +0000 Subject: [PATCH 16/25] add test case --- tests/unit/sandbox/test_sandbox_manager.py | 170 +++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index a444eb0be..b0bfcc31e 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -8,6 +8,7 @@ from rock.actions import SandboxStatusResponse from rock.actions.sandbox.response import State +from rock.config import RockConfig from rock.deployments.config import DockerDeploymentConfig, RayDeploymentConfig from rock.deployments.constants import Port from rock.deployments.status import ServiceStatus @@ -170,3 +171,172 @@ async def wait_sandbox_instance_alive(sandbox_manager: SandboxManager, sandbox_i cnt += 1 if cnt > 60: raise Exception("sandbox not alive") + + +async def wait_for_rocklet_service_ready(sandbox_manager: SandboxManager, sandbox_id: str, timeout: int = 120): + """Wait for rocklet HTTP service to be ready in container + + Args: + sandbox_manager: SandboxManager instance + sandbox_id: Sandbox ID + timeout: Maximum wait time in seconds + + Raises: + Exception: If service is not ready within timeout + """ + from rock.deployments.constants import Port + from rock.utils import HttpUtils, EAGLE_EYE_TRACE_ID, trace_id_ctx_var + + start_time = time.time() + while time.time() - start_time < timeout: + try: + # Get sandbox info to get host_ip and port + status = await sandbox_manager.get_status(sandbox_id, use_proxy=False) + if not status.is_alive or not status.host_ip: + await asyncio.sleep(2) + continue + + # Try to connect to rocklet service + port = status.port_mapping.get(Port.PROXY) + if not port: + await asyncio.sleep(2) + continue + + # Test if rocklet service is responding + try: + await HttpUtils.get( + url=f"http://{status.host_ip}:{port}/", + headers={ + "sandbox_id": sandbox_id, + EAGLE_EYE_TRACE_ID: trace_id_ctx_var.get(), + }, + read_timeout=5, + ) + logger.info(f"Rocklet service is ready for sandbox {sandbox_id}") + return + except Exception: + # Service not ready yet, continue waiting + await asyncio.sleep(2) + continue + except Exception as e: + logger.debug(f"Waiting for rocklet service: {e}") + await asyncio.sleep(2) + + raise Exception(f"Rocklet service not ready within {timeout}s for sandbox {sandbox_id}") + + +async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_proxy: bool): + """Helper function to test get_status with Redis""" + from rock.admin.core.redis_key import alive_sandbox_key + + # Submit a sandbox + response = await sandbox_manager.submit(DockerDeploymentConfig(image="python:3.11")) + sandbox_id = response.sandbox_id + + try: + # Wait for sandbox to be alive + await check_sandbox_status_until_alive(sandbox_manager, sandbox_id) + + # If using proxy, wait for rocklet HTTP service to be ready + # if use_proxy: + # await wait_for_rocklet_service_ready(sandbox_manager, sandbox_id) + + # Test: get_status with Redis + status_response = await sandbox_manager.get_status(sandbox_id, use_proxy=use_proxy) + + # Common assertions + assert status_response.sandbox_id == sandbox_id + assert status_response.host_ip is not None + assert status_response.host_name is not None + assert status_response.is_alive is True + assert status_response.state == State.RUNNING + assert len(status_response.port_mapping) > 0 + assert status_response.image == "python:3.11" + + # Verify Redis was used/updated + redis_data = await sandbox_manager._redis_provider.json_get(alive_sandbox_key(sandbox_id), "$") + assert redis_data is not None + assert len(redis_data) > 0 + + # Additional assertions for proxy mode + if use_proxy: + # Verify remote status was fetched (phases should be populated) + assert status_response.status is not None + assert "docker_run" in status_response.status + finally: + # Cleanup + await sandbox_manager.stop(sandbox_id) + + +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_get_status_with_redis_without_proxy(sandbox_manager: SandboxManager): + """Test get_status: with Redis, without proxy (use_proxy=False)""" + await _test_get_status_with_redis(sandbox_manager, use_proxy=False) + + +@pytest.mark.skip(reason="Skip this test after proxy port is fixed") +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_get_status_with_redis_with_proxy(sandbox_manager: SandboxManager): + """Test get_status: with Redis, with proxy (use_proxy=True)""" + await _test_get_status_with_redis(sandbox_manager, use_proxy=True) + +async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, use_proxy: bool): + """Helper function to test get_status without Redis""" + # Create sandbox_manager without Redis + sandbox_manager_no_redis = SandboxManager( + rock_config, + redis_provider=None, # No Redis + ray_namespace=rock_config.ray.namespace, + ray_service=ray_service, + enable_runtime_auto_clear=False, + ) + + # Submit a sandbox + response = await sandbox_manager_no_redis.submit(DockerDeploymentConfig(image="python:3.11")) + sandbox_id = response.sandbox_id + + try: + # Wait for sandbox to be alive + await check_sandbox_status_until_alive(sandbox_manager_no_redis, sandbox_id) + + # If using proxy, wait for rocklet HTTP service to be ready + if use_proxy: + await wait_for_rocklet_service_ready(sandbox_manager_no_redis, sandbox_id) + + # Test: get_status without Redis + status_response = await sandbox_manager_no_redis.get_status(sandbox_id, use_proxy=use_proxy) + + # Common assertions + assert status_response.sandbox_id == sandbox_id + assert status_response.host_ip is not None + assert status_response.host_name is not None + assert status_response.is_alive is True + assert status_response.state == State.RUNNING + assert len(status_response.port_mapping) > 0 + assert status_response.image == "python:3.11" + assert status_response.status is not None + + # Additional assertions for proxy mode + if use_proxy: + # Verify remote status was fetched (phases should be populated) + assert "docker_run" in status_response.status + finally: + # Cleanup + await sandbox_manager_no_redis.stop(sandbox_id) + + +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_get_status_without_redis_without_proxy(rock_config: RockConfig, ray_init_shutdown, ray_service): + """Test get_status: without Redis, without proxy (use_proxy=False)""" + await _test_get_status_without_redis(rock_config, ray_service, use_proxy=False) + + +@pytest.mark.skip(reason="Skip this test after proxy port is fixed") +@pytest.mark.need_ray +@pytest.mark.asyncio +async def test_get_status_without_redis_with_proxy(rock_config: RockConfig, ray_init_shutdown, ray_service): + """Test get_status: without Redis, with proxy (use_proxy=True)""" + await _test_get_status_without_redis(rock_config, ray_service, use_proxy=True) From 14854904f9d8a9a4a8e7e7211613905fcb96a91f Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 13:49:56 +0000 Subject: [PATCH 17/25] fix test case --- tests/unit/sandbox/test_sandbox_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index b0bfcc31e..6dbda95d6 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -68,7 +68,8 @@ async def test_ray_actor_is_alive(sandbox_manager): assert await wait_sandbox_instance_alive(sandbox_manager, response.sandbox_id) - sandbox_actor = await sandbox_manager._deployment_service._ray_service.async_ray_get_actor(response.sandbox_id) + actor_name = sandbox_manager._deployment_service._get_actor_name(response.sandbox_id) + sandbox_actor = await sandbox_manager._deployment_service._ray_service.async_ray_get_actor(actor_name) ray.kill(sandbox_actor) assert not await sandbox_manager._deployment_service.is_alive(response.sandbox_id) @@ -275,7 +276,7 @@ async def test_get_status_with_redis_without_proxy(sandbox_manager: SandboxManag await _test_get_status_with_redis(sandbox_manager, use_proxy=False) -@pytest.mark.skip(reason="Skip this test after proxy port is fixed") +# @pytest.mark.skip(reason="Skip this test after proxy port is fixed") @pytest.mark.need_ray @pytest.mark.asyncio async def test_get_status_with_redis_with_proxy(sandbox_manager: SandboxManager): From 0706157761ca1472b0f615b9aa628dcb8451c84c Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 14:43:31 +0000 Subject: [PATCH 18/25] fix test --- tests/unit/sandbox/test_sandbox_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 6dbda95d6..4ff3b43f6 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -276,7 +276,7 @@ async def test_get_status_with_redis_without_proxy(sandbox_manager: SandboxManag await _test_get_status_with_redis(sandbox_manager, use_proxy=False) -# @pytest.mark.skip(reason="Skip this test after proxy port is fixed") +@pytest.mark.skip(reason="Skip this test after proxy port is fixed") @pytest.mark.need_ray @pytest.mark.asyncio async def test_get_status_with_redis_with_proxy(sandbox_manager: SandboxManager): From 56e6e359a77e0c2d14884580a88acb843d423495 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Mon, 26 Jan 2026 16:35:32 +0000 Subject: [PATCH 19/25] fix get actor name in env service --- rock/sandbox/service/env_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rock/sandbox/service/env_service.py b/rock/sandbox/service/env_service.py index e87536b09..f881ac903 100644 --- a/rock/sandbox/service/env_service.py +++ b/rock/sandbox/service/env_service.py @@ -37,7 +37,7 @@ def __init__(self, ray_namespace: str, ray_service: RayService): self._ray_service = ray_service def _get_actor_name(self, sandbox_id): - return sandbox_id + return f"sandbox-{sandbox_id}" async def env_step(self, request: EnvStepRequest) -> EnvStepResponse: sandbox_id = request.sandbox_id From 07efee97517a754f91f91efea0c4e1323798d72d Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 02:40:17 +0000 Subject: [PATCH 20/25] modify fakeredis dependency group --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a6399d464..9c89a2633 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,8 @@ admin = [ "boto3", "ray[default]==2.43.0", "pip", - "cryptography==39.0.1" + "cryptography==39.0.1", + "fakeredis[json]", ] rocklet = [ @@ -110,7 +111,6 @@ test = [ "pytest-trio", "pytest-twisted", "pytest-env", - "fakeredis[json]", ] [tool.setuptools.packages.find] From 998a3d36f8f8dd7b5667ccd010e39de36c96ab1c Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 03:01:33 +0000 Subject: [PATCH 21/25] optimize get status judgement and param --- rock/sandbox/sandbox_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 0e6c6b991..7f9a81883 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -133,15 +133,15 @@ async def _clear_redis_keys(self, sandbox_id): logger.info(f"sandbox {sandbox_id} deleted from redis") @monitor_sandbox_operation() - async def get_status(self, sandbox_id, use_proxy: bool = False) -> SandboxStatusResponse: + async def get_status(self, sandbox_id, use_rocklet: bool = False) -> SandboxStatusResponse: """ Get sandbox status with optional remote health check. - Note: The use_proxy parameter is deprecated and will be removed in a future version. + Note: The use_rocklet parameter is deprecated and will be removed in a future version. Args: sandbox_id: The sandbox identifier - use_proxy: If True, performs remote status check and alive verification (default: False) + use_rocklet: If True, performs remote status check and alive verification (default: False) Returns: SandboxStatusResponse with complete status information @@ -151,7 +151,7 @@ async def get_status(self, sandbox_id, use_proxy: bool = False) -> SandboxStatus host_ip = sandbox_info.get("host_ip") # 2. Determine status retrieval strategy - if use_proxy and host_ip: + if use_rocklet and self._redis_provider: # Use remote status check with parallel operations _, remote_status = await asyncio.gather( self._update_expire_time(sandbox_id), @@ -240,10 +240,10 @@ async def _check_alive_status( async def get_status_v2(self, sandbox_id) -> SandboxStatusResponse: """ - Deprecated: Use get_status(sandbox_id, use_proxy=True) instead. + Deprecated: Use get_status(sandbox_id, use_rocklet=True) instead. This method is kept for backward compatibility. """ - return await self.get_status(sandbox_id, use_proxy=True) + return await self.get_status(sandbox_id, use_rocklet=True) async def get_remote_status(self, sandbox_id: str, host_ip: str) -> ServiceStatus: service_status_path = PersistedServiceStatus.gen_service_status_path(sandbox_id) From ec035dd7c7aaa4e14cd0bf2805804429b92c97f4 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 03:15:24 +0000 Subject: [PATCH 22/25] fix test case: correct parameter name in sandbox test --- tests/unit/sandbox/test_sandbox_manager.py | 54 +++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/unit/sandbox/test_sandbox_manager.py b/tests/unit/sandbox/test_sandbox_manager.py index 4ff3b43f6..6a4c5a5ca 100644 --- a/tests/unit/sandbox/test_sandbox_manager.py +++ b/tests/unit/sandbox/test_sandbox_manager.py @@ -192,7 +192,7 @@ async def wait_for_rocklet_service_ready(sandbox_manager: SandboxManager, sandbo while time.time() - start_time < timeout: try: # Get sandbox info to get host_ip and port - status = await sandbox_manager.get_status(sandbox_id, use_proxy=False) + status = await sandbox_manager.get_status(sandbox_id, use_rocklet=False) if not status.is_alive or not status.host_ip: await asyncio.sleep(2) continue @@ -226,7 +226,7 @@ async def wait_for_rocklet_service_ready(sandbox_manager: SandboxManager, sandbo raise Exception(f"Rocklet service not ready within {timeout}s for sandbox {sandbox_id}") -async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_proxy: bool): +async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_rocklet: bool): """Helper function to test get_status with Redis""" from rock.admin.core.redis_key import alive_sandbox_key @@ -238,12 +238,12 @@ async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_proxy # Wait for sandbox to be alive await check_sandbox_status_until_alive(sandbox_manager, sandbox_id) - # If using proxy, wait for rocklet HTTP service to be ready - # if use_proxy: + # If using rocklet, wait for rocklet HTTP service to be ready + # if use_rocklet: # await wait_for_rocklet_service_ready(sandbox_manager, sandbox_id) # Test: get_status with Redis - status_response = await sandbox_manager.get_status(sandbox_id, use_proxy=use_proxy) + status_response = await sandbox_manager.get_status(sandbox_id, use_rocklet=use_rocklet) # Common assertions assert status_response.sandbox_id == sandbox_id @@ -259,8 +259,8 @@ async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_proxy assert redis_data is not None assert len(redis_data) > 0 - # Additional assertions for proxy mode - if use_proxy: + # Additional assertions for rocklet mode + if use_rocklet: # Verify remote status was fetched (phases should be populated) assert status_response.status is not None assert "docker_run" in status_response.status @@ -271,19 +271,19 @@ async def _test_get_status_with_redis(sandbox_manager: SandboxManager, use_proxy @pytest.mark.need_ray @pytest.mark.asyncio -async def test_get_status_with_redis_without_proxy(sandbox_manager: SandboxManager): - """Test get_status: with Redis, without proxy (use_proxy=False)""" - await _test_get_status_with_redis(sandbox_manager, use_proxy=False) +async def test_get_status_with_redis_without_rocklet(sandbox_manager: SandboxManager): + """Test get_status: with Redis, without rocklet (use_rocklet=False)""" + await _test_get_status_with_redis(sandbox_manager, use_rocklet=False) -@pytest.mark.skip(reason="Skip this test after proxy port is fixed") +@pytest.mark.skip(reason="Skip this test after rocklet port is fixed") @pytest.mark.need_ray @pytest.mark.asyncio -async def test_get_status_with_redis_with_proxy(sandbox_manager: SandboxManager): - """Test get_status: with Redis, with proxy (use_proxy=True)""" - await _test_get_status_with_redis(sandbox_manager, use_proxy=True) +async def test_get_status_with_redis_with_rocklet(sandbox_manager: SandboxManager): + """Test get_status: with Redis, with rocklet (use_rocklet=True)""" + await _test_get_status_with_redis(sandbox_manager, use_rocklet=True) -async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, use_proxy: bool): +async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, use_rocklet: bool): """Helper function to test get_status without Redis""" # Create sandbox_manager without Redis sandbox_manager_no_redis = SandboxManager( @@ -302,12 +302,12 @@ async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, u # Wait for sandbox to be alive await check_sandbox_status_until_alive(sandbox_manager_no_redis, sandbox_id) - # If using proxy, wait for rocklet HTTP service to be ready - if use_proxy: + # If using rocklet, wait for rocklet HTTP service to be ready + if use_rocklet: await wait_for_rocklet_service_ready(sandbox_manager_no_redis, sandbox_id) # Test: get_status without Redis - status_response = await sandbox_manager_no_redis.get_status(sandbox_id, use_proxy=use_proxy) + status_response = await sandbox_manager_no_redis.get_status(sandbox_id, use_rocklet=use_rocklet) # Common assertions assert status_response.sandbox_id == sandbox_id @@ -319,8 +319,8 @@ async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, u assert status_response.image == "python:3.11" assert status_response.status is not None - # Additional assertions for proxy mode - if use_proxy: + # Additional assertions for rocklet mode + if use_rocklet: # Verify remote status was fetched (phases should be populated) assert "docker_run" in status_response.status finally: @@ -330,14 +330,14 @@ async def _test_get_status_without_redis(rock_config: RockConfig, ray_service, u @pytest.mark.need_ray @pytest.mark.asyncio -async def test_get_status_without_redis_without_proxy(rock_config: RockConfig, ray_init_shutdown, ray_service): - """Test get_status: without Redis, without proxy (use_proxy=False)""" - await _test_get_status_without_redis(rock_config, ray_service, use_proxy=False) +async def test_get_status_without_redis_without_rocklet(rock_config: RockConfig, ray_init_shutdown, ray_service): + """Test get_status: without Redis, without rocklet (use_rocklet=False)""" + await _test_get_status_without_redis(rock_config, ray_service, use_rocklet=False) -@pytest.mark.skip(reason="Skip this test after proxy port is fixed") +@pytest.mark.skip(reason="Skip this test after rocklet port is fixed") @pytest.mark.need_ray @pytest.mark.asyncio -async def test_get_status_without_redis_with_proxy(rock_config: RockConfig, ray_init_shutdown, ray_service): - """Test get_status: without Redis, with proxy (use_proxy=True)""" - await _test_get_status_without_redis(rock_config, ray_service, use_proxy=True) +async def test_get_status_without_redis_with_rocklet(rock_config: RockConfig, ray_init_shutdown, ray_service): + """Test get_status: without Redis, with rocklet (use_rocklet=True)""" + await _test_get_status_without_redis(rock_config, ray_service, use_rocklet=True) From 440e6c2e65cae801dedb737dd26811335ef32336 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 04:17:08 +0000 Subject: [PATCH 23/25] optimize deployment service inherit --- rock/sandbox/service/deployment_service.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index 1fbcf1439..bcb8fa3b3 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -1,4 +1,4 @@ -from abc import abstractmethod +from abc import ABC, abstractmethod from rock.actions.sandbox.response import CommandResponse, State, SystemResourceMetrics from rock.actions.sandbox.sandbox_info import SandboxInfo @@ -15,7 +15,9 @@ logger = init_logger(__name__) -class AbstractDeploymentService(): +class AbstractDeploymentService(ABC): + """Abstract base class for deployment services implementing IDeploymentService.""" + @abstractmethod async def is_alive(self, sandbox_id: str) -> bool: ... @@ -34,25 +36,23 @@ async def get_status(self, sandbox_id: str) -> SandboxInfo: async def stop(self, sandbox_id: str): """Stop sandbox.""" - @abstractmethod async def get_mount(self, sandbox_id: str): """Get mount of sandbox.""" - ... + return None - @abstractmethod async def get_sandbox_statistics(self, sandbox_id: str): """Get sandbox statistics.""" - ... + return None - @abstractmethod async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: - ... + """Commit sandbox to image.""" + return None - @abstractmethod async def collect_system_resource_metrics(self) -> SystemResourceMetrics: - ... + """Collect system resource metrics.""" + return None -class RayDeploymentService(): +class RayDeploymentService(AbstractDeploymentService): def __init__(self, ray_namespace: str, ray_service: RayService): self._ray_namespace = ray_namespace self._ray_service = ray_service From 28beced3be8c17ad4ae2faa7ea934beae113fd0e Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 04:38:24 +0000 Subject: [PATCH 24/25] optimize abstract deployment service default return --- rock/sandbox/service/deployment_service.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment_service.py index bcb8fa3b3..7bf99a742 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment_service.py @@ -38,19 +38,19 @@ async def stop(self, sandbox_id: str): async def get_mount(self, sandbox_id: str): """Get mount of sandbox.""" - return None + raise NotImplementedError async def get_sandbox_statistics(self, sandbox_id: str): """Get sandbox statistics.""" - return None + raise NotImplementedError async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: """Commit sandbox to image.""" - return None + raise NotImplementedError async def collect_system_resource_metrics(self) -> SystemResourceMetrics: """Collect system resource metrics.""" - return None + raise NotImplementedError class RayDeploymentService(AbstractDeploymentService): def __init__(self, ray_namespace: str, ray_service: RayService): From 8722ebd04d97be405739a0bddf6c1feb4dca8e11 Mon Sep 17 00:00:00 2001 From: daifangwen Date: Tue, 27 Jan 2026 04:54:27 +0000 Subject: [PATCH 25/25] refactor: reorganize deployment service imports and structure --- rock/sandbox/sandbox_manager.py | 3 +- rock/sandbox/service/deployment/abstract.py | 45 +++++++++++++++++++ .../ray.py} | 42 +---------------- tests/unit/conftest.py | 2 +- 4 files changed, 50 insertions(+), 42 deletions(-) create mode 100644 rock/sandbox/service/deployment/abstract.py rename rock/sandbox/service/{deployment_service.py => deployment/ray.py} (82%) diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 7f9a81883..d84688a00 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -32,7 +32,8 @@ from rock.logger import init_logger from rock.sandbox.base_manager import BaseManager -from rock.sandbox.service.deployment_service import AbstractDeploymentService, RayDeploymentService +from rock.sandbox.service.deployment.abstract import AbstractDeploymentService +from rock.sandbox.service.deployment.ray import RayDeploymentService from rock.sandbox.service.sandbox_proxy_service import SandboxProxyService from rock.sdk.common.exceptions import BadRequestRockError from rock.utils import ( diff --git a/rock/sandbox/service/deployment/abstract.py b/rock/sandbox/service/deployment/abstract.py new file mode 100644 index 000000000..7f7f441e8 --- /dev/null +++ b/rock/sandbox/service/deployment/abstract.py @@ -0,0 +1,45 @@ +from abc import ABC, abstractmethod + +from rock.actions.sandbox.response import CommandResponse, SystemResourceMetrics +from rock.actions.sandbox.sandbox_info import SandboxInfo +from rock.deployments.config import DeploymentConfig +from rock.logger import init_logger + +logger = init_logger(__name__) + +class AbstractDeploymentService(ABC): + """Abstract base class for deployment services implementing IDeploymentService.""" + + @abstractmethod + async def is_alive(self, sandbox_id: str) -> bool: + ... + + @abstractmethod + async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxInfo: + """Get status of sandbox.""" + ... + + @abstractmethod + async def get_status(self, sandbox_id: str) -> SandboxInfo: + """Get status of sandbox.""" + ... + + @abstractmethod + async def stop(self, sandbox_id: str): + """Stop sandbox.""" + + async def get_mount(self, sandbox_id: str): + """Get mount of sandbox.""" + raise NotImplementedError + + async def get_sandbox_statistics(self, sandbox_id: str): + """Get sandbox statistics.""" + raise NotImplementedError + + async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: + """Commit sandbox to image.""" + raise NotImplementedError + + async def collect_system_resource_metrics(self) -> SystemResourceMetrics: + """Collect system resource metrics.""" + raise NotImplementedError \ No newline at end of file diff --git a/rock/sandbox/service/deployment_service.py b/rock/sandbox/service/deployment/ray.py similarity index 82% rename from rock/sandbox/service/deployment_service.py rename to rock/sandbox/service/deployment/ray.py index 7bf99a742..0245bfed1 100644 --- a/rock/sandbox/service/deployment_service.py +++ b/rock/sandbox/service/deployment/ray.py @@ -1,57 +1,19 @@ -from abc import ABC, abstractmethod from rock.actions.sandbox.response import CommandResponse, State, SystemResourceMetrics from rock.actions.sandbox.sandbox_info import SandboxInfo from rock.admin.core.ray_service import RayService import ray -from rock.deployments.config import DeploymentConfig, DockerDeploymentConfig +from rock.deployments.config import DockerDeploymentConfig from rock.deployments.docker import DockerDeployment from rock.deployments.status import ServiceStatus from rock.logger import init_logger from rock.sandbox.sandbox_actor import SandboxActor +from rock.sandbox.service.deployment.abstract import AbstractDeploymentService from rock.sdk.common.exceptions import BadRequestRockError from rock.utils.format import parse_memory_size logger = init_logger(__name__) - -class AbstractDeploymentService(ABC): - """Abstract base class for deployment services implementing IDeploymentService.""" - - @abstractmethod - async def is_alive(self, sandbox_id: str) -> bool: - ... - - @abstractmethod - async def submit(self, config: DeploymentConfig, user_info: dict) -> SandboxInfo: - """Get status of sandbox.""" - ... - - @abstractmethod - async def get_status(self, sandbox_id: str) -> SandboxInfo: - """Get status of sandbox.""" - ... - - @abstractmethod - async def stop(self, sandbox_id: str): - """Stop sandbox.""" - - async def get_mount(self, sandbox_id: str): - """Get mount of sandbox.""" - raise NotImplementedError - - async def get_sandbox_statistics(self, sandbox_id: str): - """Get sandbox statistics.""" - raise NotImplementedError - - async def commit(self, sandbox_id: str, image_tag: str, username: str, password: str) -> CommandResponse: - """Commit sandbox to image.""" - raise NotImplementedError - - async def collect_system_resource_metrics(self) -> SystemResourceMetrics: - """Collect system resource metrics.""" - raise NotImplementedError - class RayDeploymentService(AbstractDeploymentService): def __init__(self, ray_namespace: str, ray_service: RayService): self._ray_namespace = ray_namespace diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3529fa7e1..1cb571624 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -11,7 +11,7 @@ from rock.deployments.config import DockerDeploymentConfig from rock.logger import init_logger from rock.sandbox.sandbox_manager import SandboxManager -from rock.sandbox.service.deployment_service import RayDeploymentService +from rock.sandbox.service.deployment.ray import RayDeploymentService from rock.sandbox.service.sandbox_proxy_service import SandboxProxyService from rock.utils.providers.redis_provider import RedisProvider from rock.admin.core.ray_service import RayService