Skip to content

Commit

Permalink
[fine_tuning] Add Cluster scale up for Ilab testing (#595)
Browse files: browse the repository at this point in the history
  • Loading branch information
kpouget authored Nov 25, 2024
2 parents e876c71 + ddbf811 commit d22cc7b
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 67 deletions.
7 changes: 5 additions & 2 deletions docs/toolbox.generated/Fine_Tuning.run_fine_tuning_job.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ Parameters

* The delimiter marking the beginning of the response in the dataset samples

* default value: ``\n### Label:``


``container_image``

Expand Down Expand Up @@ -143,3 +141,8 @@ Parameters

* If true, sleeps forever instead of running the fine-tuning command.


``ephemeral_output_pvc_size``

* If a size (with units) is passed, use an ephemeral volume claim for storing the fine-tuning output. Otherwise, use an emptyDir.

6 changes: 3 additions & 3 deletions projects/fine_tuning/testing/command_args.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ sutest/cluster set_scale:
gpu_operator enable_time_sharing:
replicas: {{ gpu.time_sharing.replicas }}

sutest/cluster preload_image/kserve-runtime:
sutest/cluster preload_image:
namespace: "{{ tests.fine_tuning.namespace }}"
name: fine-tuning-image
{% if tests.fine_tuning.fms.enabled %}
image: {{ tests.fine_tuning.fms.image }}
{% elif tests.fine_tuning.ray.enabled %}
image: {{ tests.fine_tuning.ray.enabled }}
image: {{ tests.fine_tuning.ray.image }}
{% elif tests.fine_tuning.ilab.enabled %}
image: {{ tests.fine_tuning.ilab.enabled }}
image: {{ tests.fine_tuning.ilab.image }}
{% else %}
image: invalid_configuration
{% endif %}
Expand Down
67 changes: 42 additions & 25 deletions projects/fine_tuning/testing/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ci_presets:

light:
extends: [light_cluster]
clusters.sutest.compute.dedicated: false
tests.fine_tuning.matbenchmarking.enabled: false
tests.fine_tuning.test_settings.gpu: null
tests.fine_tuning.test_settings.dataset_replication: 1
Expand All @@ -26,6 +27,7 @@ ci_presets:
gpu:
gpu.prepare_cluster: true
clusters.sutest.compute.machineset.type: g4dn.2xlarge
clusters.sutest.compute.machineset.count: 1
tests.fine_tuning.test_settings.gpu: 1

# ---
Expand Down Expand Up @@ -266,13 +268,10 @@ ci_presets:

metal:
clusters.sutest.is_metal: true
clusters.driver.is_metal: true
clusters.sutest.compute.dedicated: false
clusters.driver.compute.dedicated: false

not_metal:
clusters.sutest.is_metal: false
clusters.driver.is_metal: false

use_intlab_os:
matbench.lts.opensearch.index_prefix: "psap-rhoai."
Expand Down Expand Up @@ -307,12 +306,17 @@ ci_presets:
#

fms:
tests.fine_tuning.fms.enabled: false
tests.fine_tuning.fms.enabled: true
matbench.workload: projects.fine_tuning.visualizations.fms_hf_tuning
matbench.prom_workload: projects.fine_tuning.visualizations.fms_prom
fine_tuning.default_response_template: "\n### Label:"

ray:
not_fms:
tests.fine_tuning.fms.enabled: false
fine_tuning.default_response_template: null

ray:
extends: [not_fms]
tests.fine_tuning.ray.enabled: true
tests.capture_prom: false # not needed for the time being
tests.fine_tuning.test_settings.hyper_parameters: {}
Expand Down Expand Up @@ -341,9 +345,14 @@ ci_presets:
# ---

ilab:
ci_presets.light["tests.fine_tuning.test_settings.dataset_name"]: sdg_data.jsonl
extends: [not_fms, gpu]
ci_presets.light["tests.fine_tuning.test_settings.dataset_name"]: ilab_skills_data.jsonl
ci_presets.light["tests.fine_tuning.test_settings.model_name"]: ibm-granite/granite-3b-code-instruct@hf
tests.fine_tuning.fms.enabled: false
ci_presets.light["tests.fine_tuning.test_settings.gpu"]: 1
ci_presets.gpu["clusters.sutest.compute.machineset.type"]: g5.xlarge
clusters.sutest.compute.machineset.type: g5.xlarge


tests.fine_tuning.ilab.enabled: true
tests.fine_tuning.test_settings.name: ilab
tests.fine_tuning.test_settings.dataset_name: ilab_skills_data.jsonl
Expand All @@ -360,20 +369,27 @@ ci_presets:
extends: [ilab]

tests.fine_tuning.test_settings.model_name: ibm-granite/granite-7b-base@hf
tests.fine_tuning.test_settings.dataset_name: [ilab_skills_data.jsonl, ilab_knowledge_data.jsonl]
tests.fine_tuning.test_settings.dataset_name: [ilab_large_10000samples_skills_data.jsonl, ilab_large_knowledge_data.jsonl]

tests.fine_tuning.test_settings.pod_count: 2
tests.fine_tuning.test_settings.gpu: 1

tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 8
tests.fine_tuning.test_settings.pod_count: [1, 2, 4]
tests.fine_tuning.test_settings.gpu: 2
clusters.sutest.compute.machineset.count: 4
tests.fine_tuning.test_settings.hyper_parameters.num_epochs: 1
tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: true

# ---

cluster_instructlab:
clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
fine_tuning.pvc.storage_class_name: nfs-csi
fine_tuning.pvc.access_mode: ReadWriteMany
fine_tuning.pvc.size: 500Gi
clusters.sutest.is_metal: false
clusters.sutest.compute.machineset.type: gx3-48x240x2l40s
clusters.sutest.compute.machineset.name: instructlab-standalon-6rjg8-worker-1
clusters.sutest.compute.machineset.taint: null
clusters.sutest.compute.machineset.rest_count: 1
tests.fine_tuning.test_settings.ephemeral_output_pvc_size: 500Gi

cluster_ibm_dgx:
clusters.sutest.compute.machineset.type: "IBM-DGX A100-80GB"
Expand Down Expand Up @@ -445,22 +461,11 @@ clusters:
name: workload-pods
type: m6i.2xlarge
count: null
rest_count: 0
taint:
key: only-workload-pods
value: "yes"
effect: NoSchedule
driver:
is_metal: false
compute:
dedicated: true
machineset:
name: test-pods
count: null
type: m6i.2xlarge
taint:
key: only-test-pods
value: "yes"
effect: NoSchedule
cleanup_on_exit: false

rhods:
Expand Down Expand Up @@ -488,6 +493,8 @@ fine_tuning:
size: 80Gi
storage_class_name: null
model_registry: null # if set to a fine_tuning_sources.* model registry, all the lookups will be done in this registry
default_response_template: "\n### Label:"

sources:
dmf:
type: model-registry
Expand Down Expand Up @@ -515,6 +522,8 @@ fine_tuning:
source_dir: /manually/populated
response_template: "\n### Response:"

# ---

ilab_skills_data.jsonl:
type: dataset
source_dir: 's3://instructlab-standalone/data'
Expand All @@ -530,11 +539,18 @@ fine_tuning:
source_dir: 's3://instructlab-standalone/data'
secret_key: secrets.aws_credentials

ilab_large_10000samples_skills_data.jsonl:
type: dataset
source_dir: 's3://instructlab-standalone/data'
secret_key: secrets.aws_credentials

ilab_large_knowledge_data.jsonl:
type: dataset
source_dir: 's3://instructlab-standalone/data'
secret_key: secrets.aws_credentials

# ---

alpaca_data.json:
type: dataset
source_dir: 'https://raw.githubusercontent.com/gururise/AlpacaDataCleaned/main'
Expand Down Expand Up @@ -569,6 +585,7 @@ tests:
dataset_replication: 1
pod_count: 1
container_image: null
ephemeral_output_pvc_size: null
# ---
# https://huggingface.co/transformers/v3.0.2/main_classes/trainer.html
hyper_parameters:
Expand Down
34 changes: 21 additions & 13 deletions projects/fine_tuning/testing/prepare_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
def prepare():
with run.Parallel("prepare1") as parallel:
parallel.delayed(prepare_rhoai)
parallel.delayed(scale_up_sutest)
parallel.delayed(cluster_scale_up)


test_settings = config.project.get_config("tests.fine_tuning.test_settings")
Expand Down Expand Up @@ -220,9 +220,9 @@ def prepare_namespace(test_settings):
with env.NextArtifactDir("prepare_namespace"):
set_namespace_annotations()

with run.Parallel("download") as parallel:
parallel.delayed(download_data_sources, test_settings)
parallel.delayed(preload_image)
with run.Parallel("prepare_data") as parallel:
parallel.delayed(download_data_sources, test_settings)
parallel.delayed(preload_image)

if not dry_mode:
run.run(f"oc delete pytorchjobs -n {namespace} --all")
Expand All @@ -237,18 +237,21 @@ def prepare_namespace(test_settings):
prepare_kueue_queue(False, namespace, local_kueue_name)


def scale_up_sutest():
def cluster_scale_up(wait_gpu=False):
if config.project.get_config("clusters.sutest.is_metal"):
return

node_count = config.project.get_config("clusters.sutest.compute.machineset.count")

if node_count is None:
node_count = 1
logging.info("clusters.sutest.compute.machineset.count isn't set. Not touching the cluster scale.")
return

extra = dict(scale=node_count)
run.run_toolbox_from_config("cluster", "set_scale", prefix="sutest", extra=extra, artifact_dir_suffix="_sutest")

if wait_gpu:
prepare_gpu_operator.wait_ready(enable_time_sharing=False, wait_stack_deployed=True, wait_metrics=False)

def cleanup_rhoai(mute=True):
prepare_rhoai_mod.uninstall(mute)
Expand All @@ -262,27 +265,32 @@ def cleanup_cluster():
cleanup_rhoai()


def cleanup_sutest_ns():
cleanup_namespace_test()


def cleanup_sutest_ns():
namespace = config.project.get_config("tests.fine_tuning.namespace")
# do not delete it ... (to save the PVC)
# empty the namespace


def cluster_scale_down(to_zero):
def cluster_scale_down(to_zero=None):
if config.project.get_config("clusters.sutest.is_metal"):
return

if config.project.get_config("clusters.sutest.compute.machineset.count") is None:
logging.info("clusters.sutest.compute.machineset.count isn't set. Not touching the cluster scale.")
return

machineset_name = config.project.get_config("clusters.sutest.compute.machineset.name")
has_machineset = run.run(f"oc get machineset {machineset_name} -n openshift-machine-api -oname --ignore-not-found", capture_stdout=True).stdout
if not has_machineset:
logging.info(f"No {machineset_name} machineset. Nothing to scale down.")
return

replicas = 0 if to_zero else 1
if to_zero:
replicas = 0
else:
replicas = config.project.get_config("clusters.sutest.compute.machineset.rest_count")
if replicas is None: replicas = 1

run.run(f"oc scale --replicas={replicas} machineset/{machineset_name} -n openshift-machine-api")


Expand All @@ -297,6 +305,6 @@ def preload_image():

break
except Exception:
logging.warning(f"Preloading of '{image}' try #{i+1}/{RETRIES} failed :/")
logging.warning(f"Image preloading try #{i+1}/{RETRIES} failed :/")
if i+1 == RETRIES:
raise
11 changes: 10 additions & 1 deletion projects/fine_tuning/testing/test_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def _run_test(test_artifact_dir_p, test_override_values, job_index=None):

if (response_template := dataset_source.get("response_template")) is not None:
test_settings["dataset_response_template"] = response_template
elif (default_response_template := config.project.get_config("fine_tuning.default_response_template")) is not None:
test_settings["dataset_response_template"] = default_response_template

remove_none_values(test_settings)

Expand Down Expand Up @@ -245,6 +247,7 @@ def _run_test_quality_evaluation(test_settings):

def _run_test_and_visualize(test_override_values=None):
failed = True
dry_mode = config.project.get_config("tests.dry_mode")
do_matbenchmarking = test_override_values is None and config.project.get_config("tests.fine_tuning.matbenchmarking.enabled")
do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled")

Expand Down Expand Up @@ -285,6 +288,10 @@ def _run_test_and_visualize(test_override_values=None):
logging.error(msg)
raise RuntimeError(msg)

if not dry_mode:
with env.NextArtifactDir("prepare_nodes"):
prepare_finetuning.cluster_scale_up(wait_gpu=True)

test_artifact_dir_p = [None]
try:
if do_multi_model:
Expand All @@ -302,7 +309,9 @@ def _run_test_and_visualize(test_override_values=None):
failed = _run_test(test_artifact_dir_p, test_override_values)

finally:
dry_mode = config.project.get_config("tests.dry_mode")
if not dry_mode:
prepare_finetuning.cluster_scale_down()

if not config.project.get_config("tests.visualize"):
logging.info(f"Visualization disabled.")

Expand Down
6 changes: 5 additions & 1 deletion projects/fine_tuning/toolbox/fine_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def run_fine_tuning_job(
dataset_transform=None,
dataset_prefer_cache=True,
dataset_prepare_cache_only=False,
dataset_response_template="\n### Label:",
dataset_response_template=None,
container_image="quay.io/modh/fms-hf-tuning:release-7a8ff0f4114ba43398d34fd976f6b17bb1f665f3",

gpu=0,
Expand All @@ -44,6 +44,8 @@ def run_fine_tuning_job(

capture_artifacts=True,
sleep_forever=False,

ephemeral_output_pvc_size=None,
):
"""
Run a simple fine-tuning Job.
Expand Down Expand Up @@ -77,6 +79,8 @@ def run_fine_tuning_job(
capture_artifacts: if enabled, captures the artifacts that will help post-mortem analyses
sleep_forever: if true, sleeps forever instead of running the fine-tuning command.
ephemeral_output_pvc_size: if a size (with units) is passed, use an ephemeral volume claim for storing the fine-tuning output. Otherwise, use an emptyDir.
"""

return RunAnsibleRole(locals())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ fine_tuning_run_fine_tuning_job_dataset_prefer_cache: true
fine_tuning_run_fine_tuning_job_dataset_prepare_cache_only: false

# the delimiter marking the beginning of the response in the dataset samples
fine_tuning_run_fine_tuning_job_dataset_response_template: '
### Label:'
fine_tuning_run_fine_tuning_job_dataset_response_template: null

# the image to use for the fine-tuning container
fine_tuning_run_fine_tuning_job_container_image: quay.io/modh/fms-hf-tuning:release-7a8ff0f4114ba43398d34fd976f6b17bb1f665f3
Expand Down Expand Up @@ -76,3 +74,6 @@ fine_tuning_run_fine_tuning_job_capture_artifacts: true

# if true, sleeps forever instead of running the fine-tuning command.
fine_tuning_run_fine_tuning_job_sleep_forever: false

# if a size (with units) is passed, use an ephemeral volume claim for storing the fine-tuning output. Otherwise, use an emptyDir.
fine_tuning_run_fine_tuning_job_ephemeral_output_pvc_size: null
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
-n {{ fine_tuning_run_fine_tuning_job_namespace }}
--no-headers | awk '{print $3}'
register: wait_pod_start
retries: 720
retries: 9999
delay: 30
until: wait_pod_start.stdout != "Running"

Expand Down
Loading

0 comments on commit d22cc7b

Please sign in to comment.