Skip to content

Commit

Permalink
[fine_tuning] testing: add cluster scale up in the Ilab-on-RHOAI training
Browse files Browse the repository at this point in the history
  • Loading branch information
kpouget committed Nov 22, 2024
1 parent 3c22f10 commit 0081588
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 29 deletions.
2 changes: 1 addition & 1 deletion projects/fine_tuning/testing/command_args.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ sutest/cluster set_scale:
gpu_operator enable_time_sharing:
replicas: {{ gpu.time_sharing.replicas }}

sutest/cluster preload_image/kserve-runtime:
sutest/cluster preload_image:
namespace: "{{ tests.fine_tuning.namespace }}"
name: fine-tuning-image
{% if tests.fine_tuning.fms.enabled %}
Expand Down
23 changes: 6 additions & 17 deletions projects/fine_tuning/testing/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,10 @@ ci_presets:

metal:
clusters.sutest.is_metal: true
clusters.driver.is_metal: true
clusters.sutest.compute.dedicated: false
clusters.driver.compute.dedicated: false

not_metal:
clusters.sutest.is_metal: false
clusters.driver.is_metal: false

use_intlab_os:
matbench.lts.opensearch.index_prefix: "psap-rhoai."
Expand Down Expand Up @@ -364,16 +361,19 @@ ci_presets:

tests.fine_tuning.test_settings.pod_count: 2
tests.fine_tuning.test_settings.gpu: 1

clusters.sutest.compute.machineset.count: 4
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 8
tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: true

# ---

cluster_instructlab:
clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
fine_tuning.pvc.storage_class_name: nfs-csi
clusters.sutest.is_metal: false
clusters.sutest.compute.machineset.rest_count: 1
clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
clusters.sutest.compute.machineset.name: instructlab-standalon-6rjg8-worker-1

cluster_ibm_dgx:
clusters.sutest.compute.machineset.type: "IBM-DGX A100-80GB"
Expand Down Expand Up @@ -445,22 +445,11 @@ clusters:
name: workload-pods
type: m6i.2xlarge
count: null
rest_count: null
taint:
key: only-workload-pods
value: "yes"
effect: NoSchedule
driver:
is_metal: false
compute:
dedicated: true
machineset:
name: test-pods
count: null
type: m6i.2xlarge
taint:
key: only-test-pods
value: "yes"
effect: NoSchedule
cleanup_on_exit: false

rhods:
Expand Down
21 changes: 11 additions & 10 deletions projects/fine_tuning/testing/prepare_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
def prepare():
with run.Parallel("prepare1") as parallel:
parallel.delayed(prepare_rhoai)
parallel.delayed(scale_up_sutest)
parallel.delayed(scale_up)


test_settings = config.project.get_config("tests.fine_tuning.test_settings")
Expand Down Expand Up @@ -220,9 +220,9 @@ def prepare_namespace(test_settings):
with env.NextArtifactDir("prepare_namespace"):
set_namespace_annotations()

with run.Parallel("download") as parallel:
parallel.delayed(download_data_sources, test_settings)
parallel.delayed(preload_image)
with run.Parallel("prepare_data") as parallel:
parallel.delayed(download_data_sources, test_settings)
parallel.delayed(preload_image)

if not dry_mode:
run.run(f"oc delete pytorchjobs -n {namespace} --all")
Expand All @@ -237,7 +237,7 @@ def prepare_namespace(test_settings):
prepare_kueue_queue(False, namespace, local_kueue_name)


def scale_up_sutest():
def scale_up():
if config.project.get_config("clusters.sutest.is_metal"):
return

Expand All @@ -262,10 +262,6 @@ def cleanup_cluster():
cleanup_rhoai()


def cleanup_sutest_ns():
cleanup_namespace_test()


def cleanup_sutest_ns():
namespace = config.project.get_config("tests.fine_tuning.namespace")
# do not delete it ... (to save the PVC)
Expand All @@ -282,7 +278,12 @@ def cluster_scale_down(to_zero):
logging.info(f"No {machineset_name} machineset. Nothing to scale down.")
return

replicas = 0 if to_zero else 1
if to_zero:
replicas = 0
else:
replicas = config.project.get_config("clusters.sutest.compute.machineset.rest_count")
if replicas is None: replicas = 1

run.run(f"oc scale --replicas={replicas} machineset/{machineset_name} -n openshift-machine-api")


Expand Down
9 changes: 8 additions & 1 deletion projects/fine_tuning/testing/test_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ def _run_test_quality_evaluation(test_settings):

def _run_test_and_visualize(test_override_values=None):
failed = True
dry_mode = config.project.get_config("tests.dry_mode")
do_matbenchmarking = test_override_values is None and config.project.get_config("tests.fine_tuning.matbenchmarking.enabled")
do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled")

Expand Down Expand Up @@ -285,6 +286,10 @@ def _run_test_and_visualize(test_override_values=None):
logging.error(msg)
raise RuntimeError(msg)

if not dry_mode:
with env.NextArtifactDir("prepare_nodes"):
prepare_finetuning.scale_up_sutest()

test_artifact_dir_p = [None]
try:
if do_multi_model:
Expand All @@ -302,7 +307,9 @@ def _run_test_and_visualize(test_override_values=None):
failed = _run_test(test_artifact_dir_p, test_override_values)

finally:
dry_mode = config.project.get_config("tests.dry_mode")
if not dry_mode:
prepare_finetuning.scale_down()

if not config.project.get_config("tests.visualize"):
logging.info(f"Visualization disabled.")

Expand Down

0 comments on commit 0081588

Please sign in to comment.