From 0081588462a6f39ac4ae585aeb30bbc68b51eb3c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 21 Nov 2024 15:23:13 +0100 Subject: [PATCH] [fine_tuning] testing: add cluster scale up in the Ilab-on-RHOAI training --- .../fine_tuning/testing/command_args.yml.j2 | 2 +- projects/fine_tuning/testing/config.yaml | 23 +++++-------------- .../fine_tuning/testing/prepare_finetuning.py | 21 +++++++++-------- .../fine_tuning/testing/test_finetuning.py | 9 +++++++- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/projects/fine_tuning/testing/command_args.yml.j2 b/projects/fine_tuning/testing/command_args.yml.j2 index 0be1e4213..53f2778b7 100644 --- a/projects/fine_tuning/testing/command_args.yml.j2 +++ b/projects/fine_tuning/testing/command_args.yml.j2 @@ -20,7 +20,7 @@ sutest/cluster set_scale: gpu_operator enable_time_sharing: replicas: {{ gpu.time_sharing.replicas }} -sutest/cluster preload_image/kserve-runtime: +sutest/cluster preload_image: namespace: "{{ tests.fine_tuning.namespace }}" name: fine-tuning-image {% if tests.fine_tuning.fms.enabled %} diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml index 04dd70d73..e6a278e73 100644 --- a/projects/fine_tuning/testing/config.yaml +++ b/projects/fine_tuning/testing/config.yaml @@ -266,13 +266,10 @@ ci_presets: metal: clusters.sutest.is_metal: true - clusters.driver.is_metal: true clusters.sutest.compute.dedicated: false - clusters.driver.compute.dedicated: false not_metal: clusters.sutest.is_metal: false - clusters.driver.is_metal: false use_intlab_os: matbench.lts.opensearch.index_prefix: "psap-rhoai." 
@@ -364,7 +361,7 @@ ci_presets: tests.fine_tuning.test_settings.pod_count: 2 tests.fine_tuning.test_settings.gpu: 1 - + clusters.sutest.compute.machineset.count: 4 tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 8 tests.fine_tuning.matbenchmarking.enabled: true tests.fine_tuning.matbenchmarking.stop_on_error: true @@ -372,8 +369,11 @@ ci_presets: # --- cluster_instructlab: - clusters.sutest.compute.machineset.type: gx3-48x240x2l40s fine_tuning.pvc.storage_class_name: nfs-csi + clusters.sutest.is_metal: false + clusters.sutest.compute.machineset.rest_count: 1 + clusters.sutest.compute.machineset.type: gx3-48x240x2l40s + clusters.sutest.compute.machineset.name: instructlab-standalon-6rjg8-worker-1 cluster_ibm_dgx: clusters.sutest.compute.machineset.type: "IBM-DGX A100-80GB" @@ -445,22 +445,11 @@ clusters: name: workload-pods type: m6i.2xlarge count: null + rest_count: null taint: key: only-workload-pods value: "yes" effect: NoSchedule - driver: - is_metal: false - compute: - dedicated: true - machineset: - name: test-pods - count: null - type: m6i.2xlarge - taint: - key: only-test-pods - value: "yes" - effect: NoSchedule cleanup_on_exit: false rhods: diff --git a/projects/fine_tuning/testing/prepare_finetuning.py b/projects/fine_tuning/testing/prepare_finetuning.py index 334e069f6..8a749f361 100644 --- a/projects/fine_tuning/testing/prepare_finetuning.py +++ b/projects/fine_tuning/testing/prepare_finetuning.py @@ -13,7 +13,7 @@ def prepare(): with run.Parallel("prepare1") as parallel: parallel.delayed(prepare_rhoai) - parallel.delayed(scale_up_sutest) + parallel.delayed(scale_up) test_settings = config.project.get_config("tests.fine_tuning.test_settings") @@ -220,9 +220,9 @@ def prepare_namespace(test_settings): with env.NextArtifactDir("prepare_namespace"): set_namespace_annotations() - with run.Parallel("download") as parallel: - parallel.delayed(download_data_sources, test_settings) - parallel.delayed(preload_image) + with run.Parallel("prepare_data") 
as parallel: + parallel.delayed(download_data_sources, test_settings) + parallel.delayed(preload_image) if not dry_mode: run.run(f"oc delete pytorchjobs -n {namespace} --all") @@ -237,7 +237,7 @@ def prepare_namespace(test_settings): prepare_kueue_queue(False, namespace, local_kueue_name) -def scale_up_sutest(): +def scale_up(): if config.project.get_config("clusters.sutest.is_metal"): return @@ -262,10 +262,6 @@ def cleanup_cluster(): cleanup_rhoai() -def cleanup_sutest_ns(): - cleanup_namespace_test() - - def cleanup_sutest_ns(): namespace = config.project.get_config("tests.fine_tuning.namespace") # do not delete it ... (to save the PVC) @@ -282,7 +278,12 @@ def cluster_scale_down(to_zero): logging.info(f"No {machineset_name} machineset. Nothing to scale down.") return - replicas = 0 if to_zero else 1 + if to_zero: + replicas = 0 + else: + replicas = config.project.get_config("clusters.sutest.compute.machineset.rest_count") + if replicas is None: replicas = 1 + run.run(f"oc scale --replicas={replicas} machineset/{machineset_name} -n openshift-machine-api") diff --git a/projects/fine_tuning/testing/test_finetuning.py b/projects/fine_tuning/testing/test_finetuning.py index c67234310..e430db392 100644 --- a/projects/fine_tuning/testing/test_finetuning.py +++ b/projects/fine_tuning/testing/test_finetuning.py @@ -245,6 +245,7 @@ def _run_test_quality_evaluation(test_settings): def _run_test_and_visualize(test_override_values=None): failed = True + dry_mode = config.project.get_config("tests.dry_mode") do_matbenchmarking = test_override_values is None and config.project.get_config("tests.fine_tuning.matbenchmarking.enabled") do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled") @@ -285,6 +286,10 @@ def _run_test_and_visualize(test_override_values=None): logging.error(msg) raise RuntimeError(msg) + if not dry_mode: + with env.NextArtifactDir("prepare_nodes"): + prepare_finetuning.scale_up() + test_artifact_dir_p = [None] try: if 
do_multi_model: @@ -302,7 +307,9 @@ def _run_test_and_visualize(test_override_values=None): failed = _run_test(test_artifact_dir_p, test_override_values) finally: - dry_mode = config.project.get_config("tests.dry_mode") + if not dry_mode: + prepare_finetuning.scale_down() + if not config.project.get_config("tests.visualize"): logging.info(f"Visualization disabled.")