
Commit 40d469c

[fine_tuning] testing: add cluster scale up in the Ilab-on-RHOAI training
1 parent 3c22f10 commit 40d469c

4 files changed, +27 -30 lines

projects/fine_tuning/testing/command_args.yml.j2

+1 -1

@@ -20,7 +20,7 @@ sutest/cluster set_scale:
 gpu_operator enable_time_sharing:
   replicas: {{ gpu.time_sharing.replicas }}
 
-sutest/cluster preload_image/kserve-runtime:
+sutest/cluster preload_image:
   namespace: "{{ tests.fine_tuning.namespace }}"
   name: fine-tuning-image
 {% if tests.fine_tuning.fms.enabled %}

projects/fine_tuning/testing/config.yaml

+6 -17

@@ -266,13 +266,10 @@ ci_presets:
 
   metal:
     clusters.sutest.is_metal: true
-    clusters.driver.is_metal: true
     clusters.sutest.compute.dedicated: false
-    clusters.driver.compute.dedicated: false
 
   not_metal:
     clusters.sutest.is_metal: false
-    clusters.driver.is_metal: false
 
   use_intlab_os:
     matbench.lts.opensearch.index_prefix: "psap-rhoai."
@@ -364,16 +361,19 @@ ci_presets:
 
     tests.fine_tuning.test_settings.pod_count: 2
     tests.fine_tuning.test_settings.gpu: 1
-
+    clusters.sutest.compute.machineset.count: 4
     tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 8
     tests.fine_tuning.matbenchmarking.enabled: true
     tests.fine_tuning.matbenchmarking.stop_on_error: true
 
     # ---
 
   cluster_instructlab:
-    clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
     fine_tuning.pvc.storage_class_name: nfs-csi
+    clusters.sutest.is_metal: false
+    clusters.sutest.compute.machineset.rest_count: 1
+    clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
+    clusters.sutest.compute.machineset.name: instructlab-standalon-6rjg8-worker-1
 
   cluster_ibm_dgx:
     clusters.sutest.compute.machineset.type: "IBM-DGX A100-80GB"
@@ -445,22 +445,11 @@ clusters:
         name: workload-pods
         type: m6i.2xlarge
         count: null
+        rest_count: null
         taint:
           key: only-workload-pods
           value: "yes"
           effect: NoSchedule
-  driver:
-    is_metal: false
-    compute:
-      dedicated: true
-      machineset:
-        name: test-pods
-        count: null
-        type: m6i.2xlarge
-        taint:
-          key: only-test-pods
-          value: "yes"
-          effect: NoSchedule
   cleanup_on_exit: false
 
 rhods:
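The new rest_count knob follows the repository's dotted-key preset convention: an entry such as clusters.sutest.compute.machineset.rest_count in ci_presets overrides the nested rest_count: null field added under clusters/sutest/compute/machineset. A minimal sketch of how a dotted key can be resolved against such a nested config dict (a hypothetical helper for illustration, not the project's actual config module):

# Hypothetical helper, for illustration only; TOPSAIL's real lookup lives in its config module.
def get_config(cfg: dict, dotted_key: str, default=None):
    """Walk a nested dict following a dotted key such as 'a.b.c'."""
    node = cfg
    for part in dotted_key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

cfg = {"clusters": {"sutest": {"compute": {"machineset": {"rest_count": 1}}}}}
assert get_config(cfg, "clusters.sutest.compute.machineset.rest_count") == 1
assert get_config(cfg, "clusters.sutest.compute.machineset.count") is None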

projects/fine_tuning/testing/prepare_finetuning.py

+11 -10

@@ -13,7 +13,7 @@
 def prepare():
     with run.Parallel("prepare1") as parallel:
         parallel.delayed(prepare_rhoai)
-        parallel.delayed(scale_up_sutest)
+        parallel.delayed(scale_up)
 
 
     test_settings = config.project.get_config("tests.fine_tuning.test_settings")
@@ -220,9 +220,9 @@ def prepare_namespace(test_settings):
     with env.NextArtifactDir("prepare_namespace"):
         set_namespace_annotations()
 
-        with run.Parallel("download") as parallel:
-            parallel.delayed(download_data_sources, test_settings)
-            parallel.delayed(preload_image)
+        with run.Parallel("prepare_data") as parallel:
+            parallel.delayed(download_data_sources, test_settings)
+            parallel.delayed(preload_image)
 
         if not dry_mode:
             run.run(f"oc delete pytorchjobs -n {namespace} --all")
@@ -237,7 +237,7 @@ def prepare_namespace(test_settings):
     prepare_kueue_queue(False, namespace, local_kueue_name)
 
 
-def scale_up_sutest():
+def scale_up():
     if config.project.get_config("clusters.sutest.is_metal"):
         return
 
@@ -262,10 +262,6 @@ def cleanup_cluster():
     cleanup_rhoai()
 
 
-def cleanup_sutest_ns():
-    cleanup_namespace_test()
-
-
 def cleanup_sutest_ns():
     namespace = config.project.get_config("tests.fine_tuning.namespace")
     # do not delete it ... (to save the PVC)
@@ -282,7 +278,12 @@ def cluster_scale_down(to_zero):
         logging.info(f"No {machineset_name} machineset. Nothing to scale down.")
         return
 
-    replicas = 0 if to_zero else 1
+    if to_zero:
+        replicas = 0
+    else:
+        replicas = config.project.get_config("clusters.sutest.compute.machineset.rest_count")
+        if replicas is None: replicas = 1
+
     run.run(f"oc scale --replicas={replicas} machineset/{machineset_name} -n openshift-machine-api")
 
 
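The cluster_scale_down() change above is the consumer of the new rest_count setting: instead of always dropping the machineset to a single replica, it keeps it at its configured rest size, only defaulting to 1 when rest_count is left at null. The replica selection reduces to the following standalone sketch (the config lookup is passed in as a plain value):

# Standalone sketch of the replica selection in cluster_scale_down() above.
def pick_scale_down_replicas(to_zero: bool, rest_count) -> int:
    if to_zero:
        return 0
    # Keep the machineset at its configured rest size; fall back to one replica
    # when clusters.sutest.compute.machineset.rest_count is left at null.
    return 1 if rest_count is None else rest_count

assert pick_scale_down_replicas(True, 4) == 0      # scale all the way down
assert pick_scale_down_replicas(False, None) == 1  # rest_count: null -> 1 replica
assert pick_scale_down_replicas(False, 1) == 1     # cluster_instructlab preset value
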
projects/fine_tuning/testing/test_finetuning.py

+9 -2

@@ -99,7 +99,7 @@ def _run_test(test_artifact_dir_p, test_override_values, job_index=None):
 
     remove_none_values(test_settings)
 
-    prepare_finetuning.prepare_namespace(test_settings)
+    prepare_finetuning.prepare_namespace(test_settings)s
     failed = True
 
     _start_ts = datetime.datetime.now()
@@ -245,6 +245,7 @@ def _run_test_quality_evaluation(test_settings):
 
 def _run_test_and_visualize(test_override_values=None):
     failed = True
+    dry_mode = config.project.get_config("tests.dry_mode")
     do_matbenchmarking = test_override_values is None and config.project.get_config("tests.fine_tuning.matbenchmarking.enabled")
     do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled")
 
@@ -285,6 +286,10 @@
         logging.error(msg)
         raise RuntimeError(msg)
 
+    if not dry_mode:
+        with env.NextArtifactDir("prepare_nodes"):
+            prepare_finetuning.scale_up_sutest()
+
     test_artifact_dir_p = [None]
     try:
         if do_multi_model:
@@ -302,7 +307,9 @@
             failed = _run_test(test_artifact_dir_p, test_override_values)
 
     finally:
-        dry_mode = config.project.get_config("tests.dry_mode")
+        if not dry_mode:
+            prepare_finetuning.scale_down()
+
         if not config.project.get_config("tests.visualize"):
             logging.info(f"Visualization disabled.")
 
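Taken together, these test_finetuning.py changes wrap the actual test run in a scale-up/scale-down pair that is skipped in dry mode; because the scale-down sits in the finally block, the extra nodes are released even when the test fails. A simplified sketch of that control flow (illustrative names, not the module's API):

# Illustrative control flow only; the real code calls prepare_finetuning.scale_up_sutest()
# before the test and prepare_finetuning.scale_down() in the finally block.
def run_with_cluster_scaling(run_test, scale_up, scale_down, dry_mode: bool):
    if not dry_mode:
        scale_up()    # grow the sutest machineset before the test
    try:
        return run_test()
    finally:
        if not dry_mode:
            scale_down()    # shrink back to rest_count even if the test raised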