
Address review comments
Scott Davidson committed Oct 16, 2023
1 parent dc27d35 commit 7ad735f
Showing 4 changed files with 69 additions and 54 deletions.
README.md (10 changes: 5 additions & 5 deletions)
@@ -13,7 +13,7 @@
- [RDMA Bandwidth](#rdma-bandwidth)
- [RDMA Latency](#rdma-latency)
- [fio](#fio)
- [Pytorch](#Pytorch)
- [PyTorch](#pytorch)
- [Operator development](#operator-development)

## Installation
@@ -290,10 +290,10 @@ spec:
storage: 5Gi
```

### Pytorch
### PyTorch

Runs machine learning model training and inference micro-benchmarks from the official
Pytorch [benchmarks repo](https://github.com/pytorch/benchmark/) to compare performance
PyTorch [benchmarks repo](https://github.com/pytorch/benchmark/) to compare performance
of CPU and GPU devices on synthetic input data. Running benchmarks on CUDA-capable
devices requires the [Nvidia GPU Operator](https://github.com/NVIDIA/gpu-operator)
to be pre-installed on the target Kubernetes cluster.
@@ -302,11 +302,11 @@
The pre-built container image currently includes the `alexnet`, `resnet50` and
`llama` (inference only) models - additional models from the
[upstream repo list](https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models)
may be added as needed in the future. (Adding a new model simply requires adding it to the list
in `images/pytorch-benchmark/Dockerfile` and updating the `PytorchModel` enum in `pytorch.py`.)
in `images/pytorch-benchmark/Dockerfile` and updating the `PyTorchModel` enum in `pytorch.py`.)
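
As the paragraph above notes, adding a model touches two places: the model list baked into `images/pytorch-benchmark/Dockerfile` and the `PyTorchModel` enum. A rough sketch of what the enum extension looks like, using the standard library `Enum` so it runs standalone (the operator's real class subclasses its own `schema.Enum`, as shown in the `pytorch.py` diff below, and the added model name here is purely hypothetical):

```python
from enum import Enum

class PyTorchModel(str, Enum):
    """Enumeration of available models for benchmarking."""
    ALEXNET = "alexnet"
    RESNET50 = "resnet50"
    LLAMA = "llama"
    # Hypothetical addition: the same name must also be appended to the model
    # list in images/pytorch-benchmark/Dockerfile so it is baked into the image.
    BERT = "bert"

print([model.value for model in PyTorchModel])  # ['alexnet', 'resnet50', 'llama', 'bert']
```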

```yaml
apiVersion: perftest.stackhpc.com/v1alpha1
kind: Pytorch
kind: PyTorch
metadata:
name: pytorch-test-gpu
spec:
python/perftest/models/v1alpha1/pytorch.py (90 changes: 48 additions & 42 deletions)
@@ -28,23 +28,23 @@ class Device(str, schema.Enum):
CUDA = "cuda"

# List of models here should match list in images/pytorch-benchmark/Dockerfile
class PytorchModel(str, schema.Enum):
class PyTorchModel(str, schema.Enum):
"""
Enumeration of available models for benchmarking.
"""
ALEXNET = "alexnet"
RESNET50 = "resnet50"
LLAMA = "llama"

class PytorchBenchmarkType(str, schema.Enum):
class PyTorchBenchmarkType(str, schema.Enum):
"""
Enumeration of model processes available to benchmark.
"""
TRAIN = "train"
EVAL = "eval"


class PytorchSpec(base.BenchmarkSpec):
class PyTorchSpec(base.BenchmarkSpec):
"""
Defines the parameters for the PyTorch benchmark.
"""
@@ -56,31 +56,31 @@ class PytorchSpec(base.BenchmarkSpec):
base.ImagePullPolicy.IF_NOT_PRESENT,
description = "The pull policy for the image."
)
# Pytorch benchmark config options
# PyTorch benchmark config options
device: Device = Field(
Device.CPU,
description = "The device to run the ML workload."
description = (
"The device to run the ML workload."
"If device is 'cuda' then you must also make a request for GPU resources by"
"adding a 'nvidia.com/gpu: <gpu-count>' field to benchmark.spec.resources.limits"
)
)
model: PytorchModel = Field(
model: PyTorchModel = Field(
description = "The ML model to benchmark."
)
benchmark_type: PytorchBenchmarkType = Field(
PytorchBenchmarkType.EVAL,
benchmark_type: PyTorchBenchmarkType = Field(
PyTorchBenchmarkType.EVAL,
description = "Whether to benchmark the training or inference (eval) process."
)
input_batch_size: conint(multiple_of=2, ge=2) = Field(
64,
description = "The batch size for the generated model input data.",
)
gpu_count: t.Optional[conint(ge=1)] = Field(
None,
description = "Number of GPUs to request for the benchmark run. Defaults to 0 for device = cpu and 1 for device = cuda."
)



class PytorchResult(schema.BaseModel):
class PyTorchResult(schema.BaseModel):
"""
Represents an individual Pytorch benchmark result.
Represents an individual PyTorch benchmark result.
Some notes on the inner workings of the pytorch benchmark script:
- Currently only runs one batch for benchmark so 'time per batch' in pytorch output == total time.
@@ -94,15 +94,15 @@ class PytorchResult(schema.BaseModel):
...,
description = "The CPU wall time (in seconds) as reported by the pytorch benchmark script."
)
peak_cpu_memory_GB: schema.confloat(ge = 0) = Field(
peak_cpu_memory: schema.confloat(ge = 0) = Field(
...,
description = "The peak CPU memory usage (in GB) reported by the pytorch benchmark script."
)
gpu_time: t.Optional[schema.confloat(ge = 0)] = Field(
None, # Default to zero for clearer reporting on cpu-only runs
description = "The GPU wall time (in seconds) reported by the pytorch benchmark script."
)
peak_gpu_memory_GB: t.Optional[schema.confloat(ge = 0)] = Field(
peak_gpu_memory: t.Optional[schema.confloat(ge = 0)] = Field(
None, # Default to zero for clearer reporting on cpu-only runs
description = "The peak GPU memory usage (in GB) reported by the pytorch benchmark script."
)
@@ -111,15 +111,15 @@ class PytorchResult(schema.BaseModel):
)


class PytorchStatus(base.BenchmarkStatus):
class PyTorchStatus(base.BenchmarkStatus):
"""
Represents the status of the Pytorch benchmark.
Represents the status of the PyTorch benchmark.
"""
gpu_count: conint(ge=0) = Field(
None,
description = "The number of gpus used in this benchmark"
)
result: t.Optional[PytorchResult] = Field(
result: t.Optional[PyTorchResult] = Field(
None,
description = "The result of the benchmark."
)
@@ -147,7 +147,7 @@ class PytorchStatus(base.BenchmarkStatus):
)


class Pytorch(
class PyTorch(
base.Benchmark,
subresources = {"status": {}},
printer_columns = [
@@ -204,14 +204,14 @@ class Pytorch(
]
):
"""
Custom resource for running an Pytorch benchmark.
Custom resource for running a PyTorch benchmark.
"""
spec: PytorchSpec = Field(
spec: PyTorchSpec = Field(
...,
description = "The parameters for the benchmark."
)
status: PytorchStatus = Field(
default_factory = PytorchStatus,
status: PyTorchStatus = Field(
default_factory = PyTorchStatus,
description = "The status of the benchmark."
)

@@ -220,11 +220,13 @@ async def pod_modified(
pod: t.Dict[str, t.Any],
fetch_pod_log: t.Callable[[], t.Awaitable[str]]
):
# Set default GPU count to display in status if none given in spec
if self.spec.gpu_count is None:
self.status.gpu_count = (0 if self.spec.device == "cpu" else 1)
# Parse GPU count from resources to display in status
if self.spec.resources:
if self.spec.resources.limits:
if 'nvidia.com/gpu' in self.spec.resources.limits.keys():
self.status.gpu_count = self.spec.resources.limits['nvidia.com/gpu']
else:
self.status.gpu_count = self.spec.gpu_count
self.status.gpu_count = 0

pod_phase = pod.get("status", {}).get("phase", "Unknown")
if pod_phase == "Running":
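
The `pod_modified` change above now derives the reported GPU count from the requested resource limits rather than from the removed `gpu_count` field. A standalone sketch of the same lookup written against plain dictionaries (the operator's spec exposes a pydantic resources model, so the attribute access in the diff differs slightly):

```python
from typing import Any, Dict, Optional

def gpu_count_from_limits(resources: Optional[Dict[str, Any]]) -> int:
    """Defensive variant of the nested checks in pod_modified above.

    Missing resources, missing limits or an absent 'nvidia.com/gpu' key all
    count as zero GPUs; Kubernetes quantities arrive as strings, so coerce.
    """
    if not resources:
        return 0
    limits = resources.get("limits") or {}
    return int(limits.get("nvidia.com/gpu", 0))

print(gpu_count_from_limits({"limits": {"nvidia.com/gpu": "2"}}))  # 2
print(gpu_count_from_limits(None))                                 # 0
```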
@@ -238,26 +240,30 @@ def summarise(self):
raise PodResultsIncompleteError("Pod has not recorded a result yet")

# Parse job output here
cpu_time = PYTORCH_CPU_TIME_REGEX.search(self.status.client_log).group('cpu_time')
cpu_time_units = PYTORCH_CPU_TIME_REGEX.search(self.status.client_log).group('cpu_time_units')
cpu_peak_memory = PYTORCH_CPU_MEMORY_REGEX.search(self.status.client_log).group('cpu_memory')
cpu_peak_memory_units = PYTORCH_CPU_MEMORY_REGEX.search(self.status.client_log).group('cpu_mem_units')
cpu_time_match = PYTORCH_CPU_TIME_REGEX.search(self.status.client_log)
cpu_time = cpu_time_match.group('cpu_time')
cpu_time_units = cpu_time_match.group('cpu_time_units')
cpu_memory_match = PYTORCH_CPU_MEMORY_REGEX.search(self.status.client_log)
cpu_peak_memory = cpu_memory_match.group('cpu_memory')
cpu_peak_memory_units = cpu_memory_match.group('cpu_mem_units')

if cpu_time_units != "milliseconds" or cpu_peak_memory_units != "GB":
raise PodLogFormatError(
"results output in unexpected units"
"results output in unexpected units - expected 'milliseconds' and 'GB'"
"(it's possible that results formatting has changed in upstream pytorch-benchmarks)"
)

if self.spec.device != "cpu":
# Parse GPU results
gpu_time = PYTORCH_GPU_TIME_REGEX.search(self.status.client_log).group('gpu_time')
gpu_peak_memory = PYTORCH_GPU_MEMORY_REGEX.search(self.status.client_log).group('gpu_memory')
gpu_time_units = PYTORCH_GPU_TIME_REGEX.search(self.status.client_log).group('gpu_time_units')
gpu_peak_memory_units = PYTORCH_GPU_MEMORY_REGEX.search(self.status.client_log).group('gpu_mem_units')
gpu_time_match = PYTORCH_GPU_TIME_REGEX.search(self.status.client_log)
gpu_time = gpu_time_match.group('gpu_time')
gpu_time_units = gpu_time_match.group('gpu_time_units')
gpu_memory_match = PYTORCH_GPU_MEMORY_REGEX.search(self.status.client_log)
gpu_peak_memory = gpu_memory_match.group('gpu_memory')
gpu_peak_memory_units = gpu_memory_match.group('gpu_mem_units')
if gpu_time_units != "milliseconds" or gpu_peak_memory_units != "GB":
raise PodLogFormatError(
"results output in unexpected units"
"results output in unexpected units - expected 'milliseconds' and 'GB'"
"(it's possible that results formatting has changed in upstream pytorch-benchmarks)"
)
# Convert times to seconds to match GNU time output
@@ -269,11 +275,11 @@ def summarise(self):
gnu_time_result = GnuTimeResult.parse(self.status.client_log)

# Convert times to seconds to match GNU time output
self.status.result = PytorchResult(
self.status.result = PyTorchResult(
pytorch_time = float(cpu_time) / 1000,
peak_cpu_memory_GB = cpu_peak_memory,
peak_cpu_memory = cpu_peak_memory,
gpu_time = gpu_time,
peak_gpu_memory_GB = gpu_peak_memory,
peak_gpu_memory = gpu_peak_memory,
gnu_time = gnu_time_result,
)

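The `summarise` refactor above runs each regex once, keeps the match object, and reads the named groups from it instead of repeating the search per group. The actual `PYTORCH_*_REGEX` patterns live elsewhere in the module and are not part of this diff, so the sketch below invents a log line and pattern purely to illustrate the named-group reuse and the unit check:

```python
import re

# Invented pattern and log format, NOT the operator's real PYTORCH_CPU_TIME_REGEX.
CPU_TIME_REGEX = re.compile(
    r"CPU Wall Time per batch:\s+(?P<cpu_time>\d+(?:\.\d+)?)\s+(?P<cpu_time_units>\w+)"
)

log = "CPU Wall Time per batch: 123.456 milliseconds"

cpu_time_match = CPU_TIME_REGEX.search(log)
if cpu_time_match is None:
    raise ValueError("failed to parse CPU time from benchmark log")

# One search, several groups - the same shape as the refactored summarise().
cpu_time = float(cpu_time_match.group("cpu_time"))
cpu_time_units = cpu_time_match.group("cpu_time_units")

if cpu_time_units != "milliseconds":
    raise ValueError(f"results output in unexpected units: {cpu_time_units!r}")

print(cpu_time / 1000)  # convert to seconds, as summarise() does -> 0.123456
```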
python/perftest/templates/pytorch.yaml.j2 (20 changes: 14 additions & 6 deletions)
@@ -20,13 +20,21 @@ tasks:
image: {{ benchmark.spec.image }}
imagePullPolicy: {{ benchmark.spec.image_pull_policy }}
command: ["time"]
args: ["-v", "python3", "run.py", {{ benchmark.spec.model }}, "-t", {{ benchmark.spec.benchmark_type }}, "-d", {{ benchmark.spec.device }}, "--bs", "{{ benchmark.spec.input_batch_size}}"]
{% if benchmark.spec.device == "cuda" %}
args:
- -v
- python3
- run.py
- "{{ benchmark.spec.model }}"
- -t
- "{{ benchmark.spec.benchmark_type }}"
- -d
- "{{ benchmark.spec.device }}"
- --bs
- "{{ benchmark.spec.input_batch_size}}"
{%- if benchmark.spec.resources %}
resources:
limits:
# Default to single GPU for CUDA runs
nvidia.com/gpu: {{ benchmark.spec.gpu_count if benchmark.spec.gpu_count else 1 }}
{% endif %}
{{ benchmark.spec.resources | toyaml | indent(12) }}
{%- endif %}
# Avoid pods from other benchmarks
{{ macros.distribution_spread(benchmark) | indent(8) }}
{%- endcall %}
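
The template now renders whatever `spec.resources` the user supplied via `toyaml | indent(12)` instead of hard-coding a single `nvidia.com/gpu` limit. The `toyaml` filter comes from the operator's own template machinery and is not shown in this diff; the snippet below is an assumption about how such a filter could be wired up with Jinja2 and PyYAML, just to show how the indentation works out:

```python
import jinja2
import yaml

env = jinja2.Environment()
# Assumed implementation of the project's toyaml filter: serialise to block-style YAML.
env.filters["toyaml"] = lambda value: yaml.safe_dump(value, default_flow_style=False).rstrip()

template = env.from_string(
    "        resources:\n"
    "          {{ resources | toyaml | indent(10) }}\n"
)

# Jinja's indent filter pads every line after the first, so nested keys line up.
print(template.render(resources={"limits": {"nvidia.com/gpu": 1}}))
```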
python/perftest/utils.py (3 changes: 2 additions & 1 deletion)
@@ -120,7 +120,8 @@ class GnuTimeResult(schema.BaseModel):
cpu_percentage: confloat(ge=0) = Field(description="The (peak) percentage of CPU used.")
wall_time_secs: confloat(ge=0) = Field(description="The wall clock time for this benchmark run.")

def parse(input: str):
@classmethod
def parse(cls, input: str):
match = GNU_TIME_EXTRACTION_REGEX.search(input)
if not match:
raise PodLogFormatError("failed to parse output of GNU time command")
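With `parse` now a real `@classmethod`, callers such as `summarise()` can invoke it directly on the class, e.g. `GnuTimeResult.parse(self.status.client_log)`. A minimal, self-contained sketch of the same pattern, using a toy regex and a plain dataclass in place of the real `GNU_TIME_EXTRACTION_REGEX` and pydantic model:

```python
import re
from dataclasses import dataclass

# Toy stand-in for GNU_TIME_EXTRACTION_REGEX; the project's real pattern captures more fields.
WALL_TIME_REGEX = re.compile(
    r"Elapsed \(wall clock\) time.*:\s+(?P<mins>\d+):(?P<secs>\d+\.\d+)"
)

@dataclass
class GnuTimeResult:
    wall_time_secs: float

    @classmethod
    def parse(cls, text: str) -> "GnuTimeResult":
        match = WALL_TIME_REGEX.search(text)
        if not match:
            raise ValueError("failed to parse output of GNU time command")
        return cls(wall_time_secs=int(match.group("mins")) * 60 + float(match.group("secs")))

log = "Elapsed (wall clock) time (h:mm:ss or m:ss): 0:12.34"
print(GnuTimeResult.parse(log))  # GnuTimeResult(wall_time_secs=12.34)
```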
