From b90d7fd251037031f11793b8c62e9ed2f1983208 Mon Sep 17 00:00:00 2001 From: abhijeet-dhumal Date: Mon, 23 Dec 2024 11:57:20 +0530 Subject: [PATCH] Added resources list for CPU-based use case, updated requirements.txt to resolve fsspec/numpy package compatibility issue, and added license in MNIST script --- tests/kfto/kfto_mnist_training_test.go | 26 +++++++++++++++++----- tests/kfto/resources/mnist.py | 14 ++++++++++++ tests/kfto/resources/requirements-rocm.txt | 4 +++- tests/kfto/resources/requirements.txt | 4 +++- 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go index 63596c07..ce501a9d 100644 --- a/tests/kfto/kfto_mnist_training_test.go +++ b/tests/kfto/kfto_mnist_training_test.go @@ -31,7 +31,7 @@ import ( ) func TestPyTorchJobMnistCpu(t *testing.T) { - runKFTOPyTorchMnistJob(t, 0, 2, "", GetCudaTrainingImage(), "resources/requirements.txt") + runKFTOPyTorchMnistJob(t, 0, 1, "", GetCudaTrainingImage(), "resources/requirements.txt") } func TestPyTorchJobMnistWithCuda(t *testing.T) { @@ -255,18 +255,18 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config } if useGPU { - // Update resource lists + // Update resource lists for GPU (NVIDIA/ROCm) usecase tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("8Gi"), + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)), }, } tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("8Gi"), + corev1.ResourceCPU: 
resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), corev1.ResourceName(gpuLabel): resource.MustParse(fmt.Sprint(numGpus)), }, } @@ -284,6 +284,20 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config Operator: corev1.TolerationOpExists, }, } + } else { + // Update resource lists for CPU usecase + tuningJob.Spec.PyTorchReplicaSpecs["Master"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), + }, + } + tuningJob.Spec.PyTorchReplicaSpecs["Worker"].Template.Spec.Containers[0].Resources = corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("6Gi"), + }, + } } tuningJob, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Create(test.Ctx(), tuningJob, metav1.CreateOptions{}) diff --git a/tests/kfto/resources/mnist.py b/tests/kfto/resources/mnist.py index 7d8d445d..5853e376 100644 --- a/tests/kfto/resources/mnist.py +++ b/tests/kfto/resources/mnist.py @@ -1,3 +1,17 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import os diff --git a/tests/kfto/resources/requirements-rocm.txt b/tests/kfto/resources/requirements-rocm.txt index 1880dc8f..0c521411 100644 --- a/tests/kfto/resources/requirements-rocm.txt +++ b/tests/kfto/resources/requirements-rocm.txt @@ -1,3 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/rocm6.1 torchvision==0.19.0 -tensorboard==2.18.0 \ No newline at end of file +tensorboard==2.18.0 +fsspec[http]==2024.6.1 +numpy<2.1,>=1.22 \ No newline at end of file diff --git a/tests/kfto/resources/requirements.txt b/tests/kfto/resources/requirements.txt index e3ae7b3e..8c14939b 100644 --- a/tests/kfto/resources/requirements.txt +++ b/tests/kfto/resources/requirements.txt @@ -1,2 +1,4 @@ torchvision==0.19.0 -tensorboard==2.18.0 \ No newline at end of file +tensorboard==2.18.0 +fsspec[http]==2024.6.1 +numpy<2.1,>=1.22 \ No newline at end of file