fix smoke tests bug (#4492)
* fix release smoke tests

* robust backward compatibility

* bug fix

* support gke mark

* update retry field

* fix

* fix smoke tests issue

* update command of GCP

* comment for azure fix

* clear resources

* remove legacy_credentials

* bug fix

* resolve PR comment

* update comment

* text param for subprocess

* longer timeout

* rename to is_on_gcp

* revert initial_delay

* cache based on boot time

* fix command

* fix bug

* add TODO
zpoint authored Jan 14, 2025
1 parent 051b5fa commit 35f0cf4
Showing 10 changed files with 64 additions and 23 deletions.
20 changes: 15 additions & 5 deletions .buildkite/generate_pipeline.py
@@ -38,7 +38,7 @@
 QUEUE_GENERIC_CLOUD = 'generic_cloud'
 QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve'
 QUEUE_KUBERNETES = 'kubernetes'
-QUEUE_KUBERNETES_SERVE = 'kubernetes_serve'
+QUEUE_GKE = 'gke'
 # Only aws, gcp, azure, and kubernetes are supported for now.
 # Other clouds do not have credentials.
 CLOUD_QUEUE_MAP = {
@@ -54,7 +54,9 @@
     'aws': QUEUE_GENERIC_CLOUD_SERVE,
     'gcp': QUEUE_GENERIC_CLOUD_SERVE,
     'azure': QUEUE_GENERIC_CLOUD_SERVE,
-    'kubernetes': QUEUE_KUBERNETES_SERVE
+    # Now we run kubernetes on a local cluster, so it should be fine if we
+    # run serve tests on the same queue as kubernetes.
+    'kubernetes': QUEUE_KUBERNETES
 }

 GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by '
@@ -98,6 +100,7 @@ def _extract_marked_tests(file_path: str,
         clouds_to_include = []
         clouds_to_exclude = []
         is_serve_test = 'serve' in marks
+        run_on_gke = 'requires_gke' in marks
         for mark in marks:
             if mark.startswith('no_'):
                 clouds_to_exclude.append(mark[3:])
@@ -130,13 +133,15 @@ def _extract_marked_tests(file_path: str,
                 f'but we only have credentials for {final_clouds_to_include}. '
                 f'clouds {excluded_clouds} are skipped.')
         function_cloud_map[function_name] = (final_clouds_to_include, [
-            cloud_queue_map[cloud] for cloud in final_clouds_to_include
+            QUEUE_GKE if run_on_gke else cloud_queue_map[cloud]
+            for cloud in final_clouds_to_include
         ])
     return function_cloud_map


 def _generate_pipeline(test_file: str,
-                       filter_marks: List[str]) -> Dict[str, Any]:
+                       filter_marks: List[str],
+                       auto_retry: bool = False) -> Dict[str, Any]:
     """Generate a Buildkite pipeline from test files."""
     steps = []
     function_cloud_map = _extract_marked_tests(test_file, filter_marks)
@@ -153,6 +158,11 @@ def _generate_pipeline(test_file: str,
             },
             'if': f'build.env("{cloud}") == "1"'
         }
+        if auto_retry:
+            step['retry'] = {
+                # Automatically retry 2 times on any failure by default.
+                'automatic': True
+            }
         steps.append(step)
     return {'steps': steps}

@@ -180,7 +190,7 @@ def _convert_release(test_files: List[str], filter_marks: List[str]):
     output_file_pipelines = []
     for test_file in test_files:
         print(f'Converting {test_file} to {yaml_file_path}')
-        pipeline = _generate_pipeline(test_file, filter_marks)
+        pipeline = _generate_pipeline(test_file, filter_marks, auto_retry=True)
         output_file_pipelines.append(pipeline)
         print(f'Converted {test_file} to {yaml_file_path}\n\n')
     # Enable all clouds by default for release pipeline.
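Taken together, these generator changes route any test marked requires_gke to the dedicated gke queue and give release pipelines automatic retries. A minimal sketch of the resulting step construction follows; the inputs are hypothetical (the real generator derives them from pytest marks and Buildkite env vars, and a real step also carries label/command fields):

# Hypothetical inputs, for illustration only.
run_on_gke = True    # the test carries @pytest.mark.requires_gke
auto_retry = True    # passed by _convert_release for release pipelines
cloud = 'kubernetes'
cloud_queue_map = {'kubernetes': 'kubernetes'}

queue = 'gke' if run_on_gke else cloud_queue_map[cloud]
step = {
    'agents': {'queue': queue},
    'if': f'build.env("{cloud}") == "1"',
}
if auto_retry:
    # 'automatic': True uses Buildkite's default retry limit of 2 attempts.
    step['retry'] = {'automatic': True}

assert step['agents']['queue'] == 'gke'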
8 changes: 7 additions & 1 deletion sky/data/mounting_utils.py
@@ -122,15 +122,21 @@ def get_az_mount_cmd(container_name: str,
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,
         container_name=container_name)
+    # The line below ensures the cache directory is new before mounting to
+    # avoid "config error in file_cache [temp directory not empty]" error, which
+    # can occur after stopping and starting the same cluster on Azure.
+    # This helps ensure a clean state for blobfuse2 operations.
+    remote_boot_time_cmd = 'date +%s -d "$(uptime -s)"'
     if _bucket_sub_path is None:
         bucket_sub_path_arg = ''
     else:
         bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
+    # TODO(zpoint): clear old cache that has been created in the previous boot.
     mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
                  f'{key_env_var} '
                  f'blobfuse2 {mount_path} --allow-other --no-symlinks '
                  '-o umask=022 -o default_permissions '
-                 f'--tmp-path {cache_path} '
+                 f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
                  f'{bucket_sub_path_arg}'
                  f'--container-name {container_name}')
     return mount_cmd
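The effect of the change is that each VM boot gets a fresh blobfuse2 cache directory, keyed by the boot timestamp. A minimal sketch of the resulting command, assuming hypothetical account/container values and a cache-path layout close to _BLOBFUSE_CACHE_DIR:

# Hypothetical values, for illustration only.
storage_account_name = 'myaccount'
container_name = 'mycontainer'
mount_path = '/mnt/data'
cache_path = f'~/.sky/blobfuse2_cache/{storage_account_name}_{container_name}'
remote_boot_time_cmd = 'date +%s -d "$(uptime -s)"'

mount_cmd = (f'blobfuse2 {mount_path} --allow-other --no-symlinks '
             '-o umask=022 -o default_permissions '
             f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
             f'--container-name {container_name}')
# At run time the remote shell expands $(date +%s -d "$(uptime -s)") to the
# VM's boot timestamp, e.g. --tmp-path ..._1736812800, so a restarted VM
# never reuses the previous boot's (possibly non-empty) cache directory.
print(mount_cmd)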
11 changes: 8 additions & 3 deletions tests/backward_compatibility_tests.sh
@@ -50,6 +50,14 @@ uv pip install --prerelease=allow "azure-cli>=2.65.0"
 uv pip install -e ".[all]"


+clear_resources() {
+  sky down ${CLUSTER_NAME}* -y
+  sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
+}
+
+# Set trap to call cleanup on script exit
+trap clear_resources EXIT
+
 # exec + launch
 if [ "$start_from" -le 1 ]; then
   conda activate sky-back-compat-master
@@ -193,6 +201,3 @@ echo "$s"
 echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1
 echo "$s" | grep "CANCELLING\|CANCELLED" | wc -l | grep 1 || exit 1
 fi
-
-sky down ${CLUSTER_NAME}* -y
-sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
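The trap makes the cleanup run whether the script finishes normally or bails out early with exit 1, which the old trailing sky down lines did not guarantee. For readers less familiar with shell traps, a rough Python analogy (an illustration, not part of the change) is atexit:

import atexit
import subprocess

CLUSTER_NAME = 'back-compat-test'  # hypothetical; the real script derives it

def clear_resources() -> None:
    # Best-effort teardown; check=False tolerates "nothing to clean up".
    subprocess.run(f'sky down {CLUSTER_NAME}* -y', shell=True, check=False)

# Like `trap clear_resources EXIT`: registered handlers run whether the
# program finishes normally or exits early via sys.exit()/an uncaught error.
atexit.register(clear_resources)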
14 changes: 6 additions & 8 deletions tests/conftest.py
@@ -121,6 +121,12 @@ def _get_cloud_to_run(config) -> List[str]:


 def pytest_collection_modifyitems(config, items):
+    if config.option.collectonly:
+        for item in items:
+            full_name = item.nodeid
+            marks = [mark.name for mark in item.iter_markers()]
+            print(f"Collected {full_name} with marks: {marks}")
+
     skip_marks = {}
     skip_marks['slow'] = pytest.mark.skip(reason='need --runslow option to run')
     skip_marks['managed_jobs'] = pytest.mark.skip(
@@ -220,11 +226,3 @@ def aws_config_region(monkeypatch: pytest.MonkeyPatch) -> str:
     if isinstance(ssh_proxy_command, dict) and ssh_proxy_command:
         region = list(ssh_proxy_command.keys())[0]
     return region
-
-
-def pytest_collection_modifyitems(config, items):
-    if config.option.collectonly:
-        for item in items:
-            full_name = item.nodeid
-            marks = [mark.name for mark in item.iter_markers()]
-            print(f"Collected {full_name} with marks: {marks}")
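This duplicate definition was the root cause of the collection bug: Python rebinds a module-level name on each def, so only the second pytest_collection_modifyitems survived import, and the skip-mark logic in the first one never ran. A minimal illustration of the shadowing:

# A later def rebinds the module-level name, so only the second survives.
def pytest_collection_modifyitems(config, items):
    print('first definition: would apply skip marks')


def pytest_collection_modifyitems(config, items):  # rebinds the name
    print('second definition: only prints collected tests')


pytest_collection_modifyitems(None, [])  # -> second definition: ...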
2 changes: 1 addition & 1 deletion tests/smoke_tests/test_basic.py
@@ -571,7 +571,7 @@ def test_kubernetes_context_failover():
         'kubectl get namespaces --context kind-skypilot | grep test-namespace || '
         '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in '
         'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }',
-        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"',
+        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 4, 8"',
         # Get contexts and set current context to the other cluster that is not kind-skypilot
         f'kubectl config use-context {context}',
         # H100 should not be in the current context
7 changes: 5 additions & 2 deletions tests/smoke_tests/test_cluster_job.py
@@ -567,6 +567,7 @@ def test_tpu_vm_pod():


 # ---------- TPU Pod Slice on GKE. ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_tpu_pod_slice_gke():
     name = smoke_tests_utils.get_cluster_name()
@@ -876,6 +877,7 @@ def test_add_and_remove_pod_annotations_with_autostop():


 # ---------- Container logs from task on Kubernetes ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_container_logs_multinode_kubernetes():
     name = smoke_tests_utils.get_cluster_name()
@@ -1429,6 +1431,7 @@ def test_aws_custom_image():
     smoke_tests_utils.run_one_test(test)


+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 @pytest.mark.parametrize(
     'image_id',
@@ -1570,7 +1573,7 @@ def test_azure_disk_tier():
         name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value
         name_on_cloud = common_utils.make_cluster_name_on_cloud(
             name, sky.Azure.max_cluster_name_length())
-        region = 'westus2'
+        region = 'eastus2'
         test = smoke_tests_utils.Test(
             'azure-disk-tier-' + disk_tier.value,
             [
@@ -1592,7 +1595,7 @@ def test_azure_best_tier_failover():
     name = smoke_tests_utils.get_cluster_name()
     name_on_cloud = common_utils.make_cluster_name_on_cloud(
         name, sky.Azure.max_cluster_name_length())
-    region = 'westus2'
+    region = 'eastus2'
     test = smoke_tests_utils.Test(
         'azure-best-tier-failover',
         [
18 changes: 18 additions & 0 deletions tests/smoke_tests/test_images.py
@@ -19,6 +19,9 @@
 # Change cloud for generic tests to aws
 # > pytest tests/smoke_tests/test_images.py --generic-cloud aws

+import os
+import subprocess
+
 import pytest
 from smoke_tests import smoke_tests_utils
@@ -345,6 +348,21 @@ def test_gcp_mig():
 @pytest.mark.gcp
 def test_gcp_force_enable_external_ips():
     name = smoke_tests_utils.get_cluster_name()
+
+    # Command to check if the instance is on GCP
+    is_on_gcp_command = (
+        'curl -s -H "Metadata-Flavor: Google" '
+        '"http://metadata.google.internal/computeMetadata/v1/instance/name"')
+
+    # Run the GCP check
+    is_on_gcp = subprocess.run(f'{is_on_gcp_command}',
+                               shell=True,
+                               check=False,
+                               text=True,
+                               capture_output=True).stdout.strip()
+    if not is_on_gcp:
+        pytest.skip('Not on GCP, skipping test')
+
     test_commands = [
         f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml',
         # Check network of vm is "default"
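The skip works because metadata.google.internal only resolves from inside a GCP VM: with curl -s and check=False, an unreachable server simply yields empty stdout, which strips to a falsy string. The same check as a standalone helper (a sketch; the helper name is ours):

import subprocess

def is_running_on_gcp() -> bool:
    # metadata.google.internal resolves only inside a GCP VM; curl -s with
    # check=False simply yields empty stdout when the server is unreachable.
    result = subprocess.run(
        'curl -s -H "Metadata-Flavor: Google" '
        '"http://metadata.google.internal/computeMetadata/v1/instance/name"',
        shell=True, check=False, text=True, capture_output=True)
    return bool(result.stdout.strip())

print('on GCP' if is_running_on_gcp() else 'not on GCP')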
2 changes: 1 addition & 1 deletion tests/smoke_tests/test_managed_job.py
@@ -698,7 +698,7 @@ def test_managed_jobs_storage(generic_cloud: str):
             storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
         output_check_cmd = f'{gcs_check_file_count} | grep 1'
     elif generic_cloud == 'azure':
-        region = 'westus2'
+        region = 'centralus'
         region_flag = f' --region {region}'
         storage_account_name = (
             storage_lib.AzureBlobStore.get_default_storage_account_name(region))
4 changes: 2 additions & 2 deletions tests/smoke_tests/test_mount_and_storage.py
@@ -1605,8 +1605,8 @@ def test_aws_regions(self, tmp_local_storage_obj, region):
         'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12',
         'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2',
         'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2',
-        'asia-southeast2', 'me-central1', 'me-central2', 'me-west1',
-        'australia-southeast1', 'australia-southeast2', 'africa-south1'
+        'asia-southeast2', 'me-central1', 'me-west1', 'australia-southeast1',
+        'australia-southeast2', 'africa-south1'
     ])
     def test_gcs_regions(self, tmp_local_storage_obj, region):
         # This tests creation and upload to bucket in all GCS regions
1 change: 1 addition & 0 deletions tests/smoke_tests/test_sky_serve.py
@@ -180,6 +180,7 @@ def test_skyserve_azure_http():

 @pytest.mark.kubernetes
 @pytest.mark.serve
+@pytest.mark.requires_gke
 def test_skyserve_kubernetes_http():
     """Test skyserve on Kubernetes"""
     name = _get_service_name()
