diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py
index 74480c9ee4d..3889105d02d 100644
--- a/.buildkite/generate_pipeline.py
+++ b/.buildkite/generate_pipeline.py
@@ -38,7 +38,7 @@
 QUEUE_GENERIC_CLOUD = 'generic_cloud'
 QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve'
 QUEUE_KUBERNETES = 'kubernetes'
-QUEUE_KUBERNETES_SERVE = 'kubernetes_serve'
+QUEUE_GKE = 'gke'
 # Only aws, gcp, azure, and kubernetes are supported for now.
 # Other clouds do not have credentials.
 CLOUD_QUEUE_MAP = {
@@ -54,7 +54,9 @@
     'aws': QUEUE_GENERIC_CLOUD_SERVE,
     'gcp': QUEUE_GENERIC_CLOUD_SERVE,
     'azure': QUEUE_GENERIC_CLOUD_SERVE,
-    'kubernetes': QUEUE_KUBERNETES_SERVE
+    # Kubernetes tests now run on a local cluster, so it is fine to run
+    # serve tests on the same queue as the other Kubernetes tests.
+    'kubernetes': QUEUE_KUBERNETES
 }
 
 GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by '
@@ -98,6 +100,7 @@ def _extract_marked_tests(file_path: str,
         clouds_to_include = []
         clouds_to_exclude = []
         is_serve_test = 'serve' in marks
+        run_on_gke = 'requires_gke' in marks
         for mark in marks:
             if mark.startswith('no_'):
                 clouds_to_exclude.append(mark[3:])
@@ -130,13 +133,15 @@
                   f'but we only have credentials for {final_clouds_to_include}. '
                   f'clouds {excluded_clouds} are skipped.')
         function_cloud_map[function_name] = (final_clouds_to_include, [
-            cloud_queue_map[cloud] for cloud in final_clouds_to_include
+            QUEUE_GKE if run_on_gke else cloud_queue_map[cloud]
+            for cloud in final_clouds_to_include
         ])
     return function_cloud_map
 
 
 def _generate_pipeline(test_file: str,
-                       filter_marks: List[str]) -> Dict[str, Any]:
+                       filter_marks: List[str],
+                       auto_retry: bool = False) -> Dict[str, Any]:
     """Generate a Buildkite pipeline from test files."""
     steps = []
     function_cloud_map = _extract_marked_tests(test_file, filter_marks)
@@ -153,6 +158,11 @@ def _generate_pipeline(test_file: str,
                 },
                 'if': f'build.env("{cloud}") == "1"'
             }
+            if auto_retry:
+                step['retry'] = {
+                    # Automatically retry up to 2 times on any failure (the Buildkite default).
+                    'automatic': True
+                }
             steps.append(step)
     return {'steps': steps}
 
@@ -180,7 +190,7 @@
     output_file_pipelines = []
     for test_file in test_files:
         print(f'Converting {test_file} to {yaml_file_path}')
-        pipeline = _generate_pipeline(test_file, filter_marks)
+        pipeline = _generate_pipeline(test_file, filter_marks, auto_retry=True)
         output_file_pipelines.append(pipeline)
         print(f'Converted {test_file} to {yaml_file_path}\n\n')
     # Enable all clouds by default for release pipeline.
diff --git a/sky/data/mounting_utils.py b/sky/data/mounting_utils.py
index d2a95a3c20b..a00a73aa9e0 100644
--- a/sky/data/mounting_utils.py
+++ b/sky/data/mounting_utils.py
@@ -122,15 +122,21 @@ def get_az_mount_cmd(container_name: str,
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,
         container_name=container_name)
+    # The boot-time suffix below ensures the cache directory is fresh for
+    # each boot, avoiding the blobfuse2 "config error in file_cache [temp
+    # directory not empty]" error that can occur after stopping and
+    # restarting the same cluster on Azure.
+    remote_boot_time_cmd = 'date +%s -d "$(uptime -s)"'
     if _bucket_sub_path is None:
         bucket_sub_path_arg = ''
     else:
         bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
+    # TODO(zpoint): clear the old cache created in previous boots.
     mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
                  f'{key_env_var} '
                  f'blobfuse2 {mount_path} --allow-other --no-symlinks '
                  '-o umask=022 -o default_permissions '
-                 f'--tmp-path {cache_path} '
+                 f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
                  f'{bucket_sub_path_arg}'
                  f'--container-name {container_name}')
     return mount_cmd
diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh
index d32e1e9e224..941cd455e64 100644
--- a/tests/backward_compatibility_tests.sh
+++ b/tests/backward_compatibility_tests.sh
@@ -50,6 +50,14 @@
 uv pip install --prerelease=allow "azure-cli>=2.65.0"
 uv pip install -e ".[all]"
 
+clear_resources() {
+  sky down ${CLUSTER_NAME}* -y
+  sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
+}
+
+# Set a trap to call cleanup on script exit.
+trap clear_resources EXIT
+
 # exec + launch
 if [ "$start_from" -le 1 ]; then
 conda activate sky-back-compat-master
@@ -193,6 +201,3 @@ echo "$s"
 echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1
 echo "$s" | grep "CANCELLING\|CANCELLED" | wc -l | grep 1 || exit 1
 fi
-
-sky down ${CLUSTER_NAME}* -y
-sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
diff --git a/tests/conftest.py b/tests/conftest.py
index cfbdf808a6b..af6367fdac6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -121,6 +121,12 @@ def _get_cloud_to_run(config) -> List[str]:
 
 
 def pytest_collection_modifyitems(config, items):
+    if config.option.collectonly:
+        for item in items:
+            full_name = item.nodeid
+            marks = [mark.name for mark in item.iter_markers()]
+            print(f"Collected {full_name} with marks: {marks}")
+
     skip_marks = {}
     skip_marks['slow'] = pytest.mark.skip(reason='need --runslow option to run')
     skip_marks['managed_jobs'] = pytest.mark.skip(
@@ -220,11 +226,3 @@ def aws_config_region(monkeypatch: pytest.MonkeyPatch) -> str:
     if isinstance(ssh_proxy_command, dict) and ssh_proxy_command:
         region = list(ssh_proxy_command.keys())[0]
     return region
-
-
-def pytest_collection_modifyitems(config, items):
-    if config.option.collectonly:
-        for item in items:
-            full_name = item.nodeid
-            marks = [mark.name for mark in item.iter_markers()]
-            print(f"Collected {full_name} with marks: {marks}")
diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py
index aead10fa733..30576d3272f 100644
--- a/tests/smoke_tests/test_basic.py
+++ b/tests/smoke_tests/test_basic.py
@@ -571,7 +571,7 @@ def test_kubernetes_context_failover():
         'kubectl get namespaces --context kind-skypilot | grep test-namespace || '
         '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in '
         'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }',
-        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"',
+        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 4, 8"',
         # Get contexts and set current context to the other cluster that is not kind-skypilot
         f'kubectl config use-context {context}',
         # H100 should not be in the current context
diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py
index dabea9c8752..1fbb1b3d875 100644
--- a/tests/smoke_tests/test_cluster_job.py
+++ b/tests/smoke_tests/test_cluster_job.py
@@ -567,6 +567,7 @@ def test_tpu_vm_pod():
 
 
 # ---------- TPU Pod Slice on GKE. ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_tpu_pod_slice_gke():
     name = smoke_tests_utils.get_cluster_name()
@@ -876,6 +877,7 @@ def test_add_and_remove_pod_annotations_with_autostop():
 
 
 # ---------- Container logs from task on Kubernetes ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_container_logs_multinode_kubernetes():
     name = smoke_tests_utils.get_cluster_name()
@@ -1429,6 +1431,7 @@ def test_aws_custom_image():
     smoke_tests_utils.run_one_test(test)
 
 
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 @pytest.mark.parametrize(
     'image_id',
@@ -1570,7 +1573,7 @@ def test_azure_disk_tier():
     name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value
     name_on_cloud = common_utils.make_cluster_name_on_cloud(
         name, sky.Azure.max_cluster_name_length())
-    region = 'westus2'
+    region = 'eastus2'
     test = smoke_tests_utils.Test(
         'azure-disk-tier-' + disk_tier.value,
         [
@@ -1592,7 +1595,7 @@ def test_azure_best_tier_failover():
     name = smoke_tests_utils.get_cluster_name()
     name_on_cloud = common_utils.make_cluster_name_on_cloud(
         name, sky.Azure.max_cluster_name_length())
-    region = 'westus2'
+    region = 'eastus2'
     test = smoke_tests_utils.Test(
         'azure-best-tier-failover',
         [
diff --git a/tests/smoke_tests/test_images.py b/tests/smoke_tests/test_images.py
index 27d6a693ae6..14769161675 100644
--- a/tests/smoke_tests/test_images.py
+++ b/tests/smoke_tests/test_images.py
@@ -19,6 +19,9 @@
 # Change cloud for generic tests to aws
 # > pytest tests/smoke_tests/test_images.py --generic-cloud aws
 
+import os
+import subprocess
+
 import pytest
 
 from smoke_tests import smoke_tests_utils
@@ -345,6 +348,21 @@ def test_gcp_mig():
 @pytest.mark.gcp
 def test_gcp_force_enable_external_ips():
     name = smoke_tests_utils.get_cluster_name()
+
+    # Command to check whether the test is running on a GCP instance.
+    is_on_gcp_command = (
+        'curl -s -H "Metadata-Flavor: Google" '
+        '"http://metadata.google.internal/computeMetadata/v1/instance/name"')
+
+    # Run the GCP check.
+    is_on_gcp = subprocess.run(f'{is_on_gcp_command}',
+                               shell=True,
+                               check=False,
+                               text=True,
+                               capture_output=True).stdout.strip()
+    if not is_on_gcp:
+        pytest.skip('Not on GCP, skipping test')
+
     test_commands = [
         f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml',
         # Check network of vm is "default"
diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py
index 4a16b469e5a..12d1654a773 100644
--- a/tests/smoke_tests/test_managed_job.py
+++ b/tests/smoke_tests/test_managed_job.py
@@ -698,7 +698,7 @@ def test_managed_jobs_storage(generic_cloud: str):
             storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
         output_check_cmd = f'{gcs_check_file_count} | grep 1'
     elif generic_cloud == 'azure':
-        region = 'westus2'
+        region = 'centralus'
         region_flag = f' --region {region}'
         storage_account_name = (
             storage_lib.AzureBlobStore.get_default_storage_account_name(region))
diff --git a/tests/smoke_tests/test_mount_and_storage.py b/tests/smoke_tests/test_mount_and_storage.py
index 89a849ad090..3f2ddb16c57 100644
--- a/tests/smoke_tests/test_mount_and_storage.py
+++ b/tests/smoke_tests/test_mount_and_storage.py
@@ -1605,8 +1605,8 @@ def test_aws_regions(self, tmp_local_storage_obj, region):
         'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12',
         'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2',
         'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2',
-        'asia-southeast2', 'me-central1', 'me-central2', 'me-west1',
-        'australia-southeast1', 'australia-southeast2', 'africa-south1'
+        'asia-southeast2', 'me-central1', 'me-west1', 'australia-southeast1',
+        'australia-southeast2', 'africa-south1'
     ])
     def test_gcs_regions(self, tmp_local_storage_obj, region):
         # This tests creation and upload to bucket in all GCS regions
diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py
index 2e820d30d52..3ba36d8a092 100644
--- a/tests/smoke_tests/test_sky_serve.py
+++ b/tests/smoke_tests/test_sky_serve.py
@@ -180,6 +180,7 @@ def test_skyserve_azure_http():
 
 @pytest.mark.kubernetes
 @pytest.mark.serve
+@pytest.mark.requires_gke
 def test_skyserve_kubernetes_http():
     """Test skyserve on Kubernetes"""
     name = _get_service_name()
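
A note on the blobfuse2 cache-path change in sky/data/mounting_utils.py above: the shell snippet date +%s -d "$(uptime -s)" prints the machine's boot time as a Unix timestamp, so the suffix stays constant for the lifetime of a boot and changes after a stop/start, giving each boot a fresh cache directory. Below is a minimal local sketch of that idea, not part of the patch: the make_boot_scoped_path helper and the sample path are hypothetical, it assumes a Linux host with GNU date and uptime, and unlike the patch (which embeds the command as $(...) so the remote VM evaluates it at mount time) it runs the command locally for illustration.

import subprocess

# Prints the boot time as a Unix timestamp. The value is stable within one
# boot and different after a reboot.
REMOTE_BOOT_TIME_CMD = 'date +%s -d "$(uptime -s)"'


def make_boot_scoped_path(cache_path: str) -> str:
    """Hypothetical helper: suffix a cache path with the boot timestamp."""
    boot_time = subprocess.run(REMOTE_BOOT_TIME_CMD,
                               shell=True,
                               check=True,
                               text=True,
                               capture_output=True).stdout.strip()
    # e.g. '/tmp/blobfuse2_cache/acct_container' ->
    #      '/tmp/blobfuse2_cache/acct_container_1731456000'
    return f'{cache_path}_{boot_time}'


if __name__ == '__main__':
    # Sample path for illustration only.
    print(make_boot_scoped_path('/tmp/blobfuse2_cache/acct_container'))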