Skip to content

Commit

Permalink
[Core] Add retry for ssh max length (#3884)
Browse files Browse the repository at this point in the history
* add retry for max length

* format

* fix

* format

* Fix and add test

* format

* format

* format
  • Loading branch information
Michaelvll authored Sep 2, 2024
1 parent 9fedbee commit 0203971
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 14 deletions.
68 changes: 54 additions & 14 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3112,7 +3112,8 @@ def _setup_node(node_id: int) -> None:
setup_script = log_lib.make_task_bash_script(setup,
env_vars=setup_envs)
encoded_script = shlex.quote(setup_script)
if detach_setup or _is_command_length_over_limit(encoded_script):

def _dump_setup_script(setup_script: str) -> None:
with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
f.write(setup_script)
f.flush()
Expand All @@ -3121,27 +3122,52 @@ def _setup_node(node_id: int) -> None:
target=remote_setup_file_name,
up=True,
stream_logs=False)

if detach_setup or _is_command_length_over_limit(encoded_script):
_dump_setup_script(setup_script)
create_script_code = 'true'
else:
create_script_code = (f'{{ echo {encoded_script} > '
f'{remote_setup_file_name}; }}')

if detach_setup:
return

setup_log_path = os.path.join(self.log_dir,
f'setup-{runner.node_id}.log')
returncode = runner.run(
f'{create_script_code} && {setup_cmd}',
log_path=setup_log_path,
process_stream=False,
# We do not source bashrc for setup, since bashrc is sourced
# in the script already.
# Skip an empty line and two lines due to the /bin/bash -i and
# source ~/.bashrc in the setup_cmd.
# bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
# bash: no job control in this shell
skip_lines=3,
)

def _run_setup(setup_cmd: str) -> int:
returncode = runner.run(
setup_cmd,
log_path=setup_log_path,
process_stream=False,
# We do not source bashrc for setup, since bashrc is sourced
# in the script already.
# Skip an empty line and two lines due to the /bin/bash -i
# and source ~/.bashrc in the setup_cmd.
# bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
# bash: no job control in this shell
skip_lines=3)
return returncode

returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
if returncode == 255:
is_message_too_long = False
with open(setup_log_path, 'r', encoding='utf-8') as f:
if 'too long' in f.read():
is_message_too_long = True

if is_message_too_long:
# If the setup script is too long, we retry it with dumping
# the script to a file and running it with SSH. We use a
# general length limit check before but it could be
# inaccurate on some systems.
logger.debug(
'Failed to run setup command inline due to '
'command length limit. Dumping setup script to '
'file and running it with SSH.')
_dump_setup_script(setup_script)
returncode = _run_setup(setup_cmd)

def error_message() -> str:
# Use the function to avoid tailing the file in success case
Expand Down Expand Up @@ -3223,7 +3249,8 @@ def _exec_code_on_head(

code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
if _is_command_length_over_limit(job_submit_cmd):

def _dump_code_to_file(codegen: str) -> None:
runners = handle.get_command_runners()
head_runner = runners[0]
with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
Expand All @@ -3238,6 +3265,9 @@ def _exec_code_on_head(
target=script_path,
up=True,
stream_logs=False)

if _is_command_length_over_limit(job_submit_cmd):
_dump_code_to_file(codegen)
job_submit_cmd = f'{mkdir_code} && {code}'

if managed_job_dag is not None:
Expand All @@ -3263,6 +3293,16 @@ def _exec_code_on_head(
job_submit_cmd,
stream_logs=False,
require_outputs=True)
if returncode == 255 and 'too long' in stdout + stderr:
# If the setup script is too long, we retry it with dumping
# the script to a file and running it with SSH. We use a general
# length limit check before but it could be inaccurate on some
# systems.
_dump_code_to_file(codegen)
returncode, stdout, stderr = self.run_on_head(handle,
job_submit_cmd,
stream_logs=False,
require_outputs=True)

# Happens when someone calls `sky exec` but remote is outdated
# necessitating calling `sky launch`.
Expand Down
4 changes: 4 additions & 0 deletions sky/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,13 @@ def __init__(self, returncode: int, command: str, error_msg: str,
self.command = command
self.error_msg = error_msg
self.detailed_reason = detailed_reason

if not command:
message = error_msg
else:
if len(command) > 100:
# Chunck the command to avoid overflow.
command = command[:100] + '...'
message = (f'Command {command} failed with return code '
f'{returncode}.\n{error_msg}')
super().__init__(message)
Expand Down
38 changes: 38 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import subprocess
import sys
import tempfile
import textwrap
import time
from typing import Dict, List, NamedTuple, Optional, Tuple
import urllib.parse
Expand Down Expand Up @@ -3436,6 +3437,43 @@ def test_gcp_zero_quota_failover():
run_one_test(test)


def test_long_setup_run_script(generic_cloud: str):
name = _get_cluster_name()
with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
suffix='.yaml') as f:
f.write(
textwrap.dedent(""" \
setup: |
echo "start long setup"
"""))
for i in range(1024 * 120):
f.write(f' echo {i}\n')
f.write(' echo "end long setup"\n')
f.write(
textwrap.dedent(""" \
run: |
echo "run"
"""))
for i in range(1024 * 120):
f.write(f' echo {i}\n')
f.write(' echo "end run"\n')
f.flush()

test = Test(
'long-setup-run-script',
[
f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup --detach-run --cpus 2+ {f.name}',
f'sky exec --detach-run {name} "echo hello"',
f'sky exec --detach-run {name} {f.name}',
f'sky logs {name} --status 1',
f'sky logs {name} --status 2',
f'sky logs {name} --status 3',
],
f'sky down -y {name}',
)
run_one_test(test)


# ---------- Testing skyserve ----------


Expand Down

0 comments on commit 0203971

Please sign in to comment.