From 061d4bd998739e16fa704a855233e62290f6cbdb Mon Sep 17 00:00:00 2001 From: Chester Li Date: Sat, 4 Jan 2025 03:29:33 +0800 Subject: [PATCH] [k8s] Add validation for pod_config #4206 (#4466) * [k8s] Add validation for pod_config #4206 Check pod_config when run 'sky check k8s' by using k8s api * update: check pod_config when launch check merged pod_config during launch using k8s api * fix test * ignore check failed when test with dryrun if there is no kube config in env, ignore ValueError when launch with dryrun. For now, we don't support check schema offline. * use deserialize api to check pod_config schema * test * create another api_client with no kubeconfig * test * update error message * update test * test * test * Update sky/backends/backend_utils.py --------- Co-authored-by: Romil Bhardwaj --- sky/backends/backend_utils.py | 7 +++++ sky/provision/kubernetes/utils.py | 46 +++++++++++++++++++++++++++++++ tests/test_config.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1de799e7cf8..6e79469a819 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -926,6 +926,13 @@ def write_cluster_config( tmp_yaml_path, cluster_config_overrides=to_provision.cluster_config_overrides) kubernetes_utils.combine_metadata_fields(tmp_yaml_path) + yaml_obj = common_utils.read_yaml(tmp_yaml_path) + pod_config = yaml_obj['available_node_types']['ray_head_default'][ + 'node_config'] + valid, message = kubernetes_utils.check_pod_config(pod_config) + if not valid: + raise exceptions.InvalidCloudConfigs( + f'Invalid pod_config. Details: {message}') if dryrun: # If dryrun, return the unfinished tmp yaml path. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 487868d1d9e..14b6b42aa58 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -893,6 +893,52 @@ def check_credentials(context: Optional[str], return True, None +def check_pod_config(pod_config: dict) \ + -> Tuple[bool, Optional[str]]: + """Check if the pod_config is a valid pod config + + Using deserialize api to check the pod_config is valid or not. + + Returns: + bool: True if pod_config is valid. + str: Error message about why the pod_config is invalid, None otherwise. + """ + errors = [] + # This api_client won't be used to send any requests, so there is no need to + # load kubeconfig + api_client = kubernetes.kubernetes.client.ApiClient() + + # Used for kubernetes api_client deserialize function, the function will use + # data attr, the detail ref: + # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244 + class InnerResponse(): + + def __init__(self, data: dict): + self.data = json.dumps(data) + + try: + # Validate metadata if present + if 'metadata' in pod_config: + try: + value = InnerResponse(pod_config['metadata']) + api_client.deserialize( + value, kubernetes.kubernetes.client.V1ObjectMeta) + except ValueError as e: + errors.append(f'Invalid metadata: {str(e)}') + # Validate spec if present + if 'spec' in pod_config: + try: + value = InnerResponse(pod_config['spec']) + api_client.deserialize(value, + kubernetes.kubernetes.client.V1PodSpec) + except ValueError as e: + errors.append(f'Invalid spec: {str(e)}') + return len(errors) == 0, '.'.join(errors) + except Exception as e: # pylint: disable=broad-except + errors.append(f'Validation error: {str(e)}') + return False, '.'.join(errors) + + def is_kubeconfig_exec_auth( context: Optional[str] = None) -> Tuple[bool, Optional[str]]: """Checks if the kubeconfig file uses exec-based authentication diff --git a/tests/test_config.py b/tests/test_config.py index 5789214dc61..d3eaeb261bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,6 +7,7 @@ import sky from sky import skypilot_config +import sky.exceptions from sky.skylet import constants from sky.utils import common_utils from sky.utils import kubernetes_enums @@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None: """)) +def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None: + task_file_path.write_text( + textwrap.dedent("""\ + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + labels: + test-key: test-value + annotations: + abc: def + spec: + containers: + - name: + imagePullSecrets: + - name: my-secret-2 + + setup: echo 'Setting up...' + run: echo 'Running...' + """)) + + def test_nested_config(monkeypatch) -> None: """Test that the nested config works.""" config = skypilot_config.Config() @@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path, assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia' +def test_k8s_config_with_invalid_config(monkeypatch, tmp_path, + enable_all_clouds) -> None: + config_path = tmp_path / 'config.yaml' + _create_config_file(config_path) + monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path) + + _reload_config() + task_path = tmp_path / 'task.yaml' + _create_invalid_config_yaml_file(task_path) + task = sky.Task.from_yaml(task_path) + + # Test Kubernetes pod_config invalid + cluster_name = 'test_k8s_config_with_invalid_config' + task.set_resources_override({'cloud': sky.Kubernetes()}) + exception_occurred = False + try: + sky.launch(task, cluster_name=cluster_name, dryrun=True) + except sky.exceptions.ResourcesUnavailableError: + exception_occurred = True + assert exception_occurred + + def test_gcp_config_with_override(monkeypatch, tmp_path, enable_all_clouds) -> None: config_path = tmp_path / 'config.yaml'