tinygrad/.github/workflows/benchmark.yml at master · drmabus/tinygrad · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
name: Benchmarks
env:
  # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower
  # TODO: very slow for llama 70B and resnet training 6 GPU
  CAPTURE_PROCESS_REPLAY: "1"
  ASSERT_PROCESS_REPLAY: "0"
  PYTHONPATH: .
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

on:
  push:
    branches:
      - master
      - update_benchmark
      - update_benchmark_staging
  workflow_dispatch:

jobs:
  # the goal of this test is to replicate a normal person on a laptop running the test
  # no process replay, no benchmarks, no CI, just a normal laptop person
  # the 3 minute timeout should not be raised
  testmacpytest:
    name: Mac pytest
    env:
      CI: ""
      CAPTURE_PROCESS_REPLAY: "0"
    runs-on: [self-hosted, macOS]
    timeout-minutes: 3
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    # brew install uv
    - name: setup python environment
      run: |
        rm -rf /tmp/tinygrad_pytest_ci
        uv venv /tmp/tinygrad_pytest_ci
        source /tmp/tinygrad_pytest_ci/bin/activate
        uv pip install .[testing]
    - name: setup staging db
      run: |
        echo "CACHEDB=/tmp/pytest-db-ci.db" >> $GITHUB_ENV
        rm -f /tmp/pytest-db-ci*
    - name: Run pytest -nauto
      run: |
        source /tmp/tinygrad_pytest_ci/bin/activate
        pytest -nauto --durations=20
    - name: openpilot compile3 0.10.1 driving_vision
      run: FLOAT16=1 CL=1 IMAGE=2 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: IMAGE=1 openpilot compile3 0.10.1 driving_vision
      run: FLOAT16=1 CL=1 IMAGE=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx

  testmacbenchmark:
    name: Mac Benchmark
    env:
      # since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root
      PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
    runs-on: [self-hosted, macOS]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        mkdir -p extra/disassemblers
        ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu
        ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: python3.11 test/external/process_replay/reset.py
    - name: Print macOS version
      run: sw_vers
    - name: Run Stable Diffusion
      run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing
    - name: Run Stable Diffusion without fp16
      run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=720 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing
    - name: Run Stable Diffusion v2
      # TODO: very slow step time
      run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=4500 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing
    # process replay can't capture this, the graph is too large
    - name: Run SDXL
      run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=5000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing
    - name: Run model inference benchmark
      run: METAL=1 NOCLANG=1 python3.11 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py
    - name: Test tensor cores
      run: METAL=1 python3.11 test/opt/test_tensor_cores.py
    - name: Test AMX tensor cores
      run: |
        DEBUG=2 CPU=1 CPU_LLVM=0 AMX=1 python3.11 test/opt/test_tensor_cores.py
        DEBUG=2 CPU=1 CPU_LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py
        DEBUG=2 CPU=1 CPU_LLVM=0 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
        DEBUG=2 CPU=1 CPU_LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx
    - name: Run Tensor Core GEMM (float)
      run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (half)
      run: DEBUG=2 SHOULD_USE_TC=1 HALF=1 python3.11 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (bfloat16)
      run: DEBUG=2 SHOULD_USE_TC=1 BFLOAT16=1 python3.11 extra/gemm/simple_matmul.py
    - name: Fuzz Padded Tensor Core GEMM
      run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py
    - name: Run LLaMA
      run: |
        BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run LLaMA with BEAM
      run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run quantized LLaMA
      run: |
        BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8
        BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4
    - name: Run quantized LLaMA3
      run: |
        BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8
        BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4
    #- name: Run LLaMA 7B on 4 (virtual) GPUs
    #  run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing
    - name: Run GPT2
      run: |
        BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=gpt2 JIT=1 ASSERT_MIN_STEP_TIME=13 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF
      run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF/BEAM
      run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run OLMoE
      run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py
    - name: Train MNIST
      run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py

    # NOTE: this is failing in CI. it is not failing on my machine and I don't really have a way to debug it
    # the error is "RuntimeError: Internal Error (0000000e:Internal Error)"
    #- name: Run 10 CIFAR training steps
    #  run: BENCHMARK_LOG=cifar_10steps JIT=1 ASSERT_MIN_STEP_TIME=3000 STEPS=10 python3.11 examples/hlb_cifar10.py
    #- name: Run 10 CIFAR training steps w HALF
    #  run: BENCHMARK_LOG=cifar_10steps_half JIT=2 ASSERT_MIN_STEP_TIME=3000 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py

    #- name: Run 10 CIFAR training steps w BF16
    #  run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py
    # TODO: too slow
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (Mac)
        path: |
          onnx_inference_speed.csv
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py

  testusbgpu:
    name: UsbGPU Benchmark
    env:
      PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
    runs-on: [self-hosted, macOS]
    timeout-minutes: 10
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: Kill stale pids
      run: |
        PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
        PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
    - name: UsbGPU boot time
      run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
    - name: UsbGPU tiny tests
      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py
    - name: UsbGPU copy speeds
      run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds
    #- name: UsbGPU openpilot test
    #  run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
    - name: UsbGPU (USB4/TB) boot time
      run: PYTHONPATH=. DEBUG=3 NV=1 NV_IFACE=PCI NV_NAK=1 time python3.11 test/test_tiny.py TestTiny.test_plus
    - name: UsbGPU (USB4/TB) tiny tests
      run: PYTHONPATH=. NV=1 NV_IFACE=PCI NV_NAK=1 python3.11 test/test_tiny.py

  testnvidiabenchmark:
    name: tinybox green Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Print nvidia-smi
      run: nvidia-smi
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/weights/LLaMA-3 weights/LLaMA-3
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Run model inference benchmark
      run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Test speed vs theoretical
      run: NV=1 IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
    - name: Test benchmark allreduce
      run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py
    - name: Test tensor cores
      run: |
        NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
        NV=1 NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
    - name: Run Tensor Core GEMM (CUDA)
      run: |
        CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
        CUDA=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
        CUDA=1 SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
        CUDA=1 SHOULD_USE_TC=1 FP8E4M3=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (PTX)
      run: NV=1 NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (NV)
      run: NV=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Test NV=1
      run: DEBUG=2 NV=1 python -m pytest -rA test/test_tiny.py
    - name: Test CUDA=1
      run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py
    - name: Run Stable Diffusion
      run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing
    # TODO: too slow
    # - name: Run SDXL
    #   run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=2000 CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing
    - name: Run LLaMA
      run: |
        BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run LLaMA with BEAM
      run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    # - name: Run LLaMA 7B on 4 GPUs
    #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing
    # - name: Run LLaMA 7B on 6 GPUs
    #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing
    - name: Run LLaMA-3 8B BEAM
      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    - name: Run LLaMA-3 8B on 4 GPUs with BEAM
      run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    - name: Run quantized LLaMA3
      run: BENCHMARK_LOG=llama3_fp8 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --temperature 0 --benchmark --quantize fp8
    # - name: Run LLaMA-3 8B on 6 GPUs
    #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    # - name: Run LLaMA-2 70B
    #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing
    - name: Run Mixtral 8x7B
      run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing
    - name: Run GPT2
      run: |
        BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=gpt2 NV=1 JIT=1 ASSERT_MIN_STEP_TIME=4 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF
      run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 ASSERT_MIN_STEP_TIME=6 python3 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF/BEAM
      run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NVIDIA)
        path: |
          onnx_inference_speed.csv
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testmorenvidiabenchmark:
    name: tinybox green Training Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    # TODO: too slow
    # - name: Fuzz Padded Tensor Core GEMM (NV)
    #   run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    # TODO: too slow
    # - name: Fuzz Padded Tensor Core GEMM (PTX)
    #   run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
    - name: HEVC Decode Benchmark
      run: VALIDATE=1 MAX_FRAMES=100 ASSERT_FPS=1400 JITBEAM=1 NV=1 PYTHONPATH=. python3 extra/hevc/decode.py
    - name: Train MNIST
      run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py
    - name: Run 10 CIFAR training steps
      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 python3 examples/hlb_cifar10.py
    - name: Run 10 CIFAR training steps w HALF
      run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py
    - name: Run 10 CIFAR training steps w BF16
      run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=120 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=350 NV=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    - name: Run full CIFAR training steps w 6 GPUS
      run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    - name: Run MLPerf resnet eval on training data
      run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
      run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testamdbenchmark:
    name: tinybox red Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setcap to python
      run: ./extra/amdpci/setup_python_cap.sh
    - name: Remove amd modules
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
    - name: Kill stale pids
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
    #- name: Insert amdgpu
    #  run: sudo modprobe amdgpu
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/weights/LLaMA-3 weights/LLaMA-3
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    #- name: setup perflevel
    #  run: |
    #    examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
    #    rocm-smi
    #- name: Show off tinybox
    #  run: /opt/rocm/bin/rocm-bandwidth-test
    # TODO: unstable on AMD
    #- name: Run model inference benchmark
    #  run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
    # TODO: unstable on AMD
    #- name: Test speed vs torch
    #  run: |
    #    python3 -c "import torch; print(torch.__version__)"
    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Test speed vs theoretical
      run: AMD=1 IGNORE_BEAM_CACHE=1 CCACHE=0 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
    - name: Test tensor cores AMD_LLVM=0
      run: AMD=1 AMD_LLVM=0 python3 test/opt/test_tensor_cores.py
    # TODO: this is flaky
    # - name: Test tensor cores AMD_LLVM=1
    #   run: AMD=1 AMD_LLVM=1 python3 test/opt/test_tensor_cores.py
    - name: Run Tensor Core GEMM (AMD)
      run: |
        AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
        AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
    - name: Test AMD=1
      run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
    #- name: Test HIP=1
    #  run: DEBUG=2 HIP=1 python -m pytest -rA test/test_tiny.py
    # TODO: AMD compiler bug causes this to fail
    #- name: Fuzz Padded Tensor Core GEMM
    #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
    #- name: Remove amdgpu
    #  run: sleep 10 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid.
    - name: Test AM cold start time
      run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test AM warm start time
      run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Run Stable Diffusion
      run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=550 AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing
    - name: Run SDXL
      run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3200 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing
    - name: Run LLaMA 7B
      run: |
        BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run LLaMA 7B with BEAM
      run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing
    # - name: Run LLaMA 7B on 4 GPUs
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0  --timing
    # - name: Run LLaMA 7B on 6 GPUs
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing
    - name: Run LLaMA-3 8B BEAM
      run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    - name: Run LLaMA-3 8B on 4 GPUs with BEAM
      run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    # - name: Run LLaMA-3 8B on 6 GPUs
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0
    #- name: Restore amdgpu
    #  run: sudo modprobe amdgpu
    # - name: Run LLaMA-2 70B
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing
    - name: Run Mixtral 8x7B
      run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing
    - name: Run GPT2
      run: |
        BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
        BENCHMARK_LOG=gpt2 AMD=1 JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF
      run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run GPT2 w HALF/BEAM
      run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testmoreamdbenchmark:
    name: tinybox red Training Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setcap to python
      run: ./extra/amdpci/setup_python_cap.sh
    - name: Remove amd modules
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
    - name: Kill stale pids
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Train MNIST
      run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py
    - name: Run 10 CIFAR training steps
      run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=200 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py
    - name: Run 10 CIFAR training steps w HALF
      run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=230 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py
    # - name: Run 10 CIFAR training steps w BF16
    #   run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py
    # TODO: too slow
    # - name: Run 10 CIFAR training steps w winograd
    #   run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    - name: Run full CIFAR training steps w 6 GPUS
      run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    # TODO: broken on some of the machines
    #- name: Test full tinyfs load
    #  run: TINYFS_ENDPOINT=10.0.52.11:6767 PYTHONPATH=. python extra/tinyfs/fetch_file.py --hash d734f5e3be9f1e9d863bfaa4fc6c1ef2 --len 175866113 --dest mapping.json --check
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testmlperfamdbenchmark:
    name: tinybox red MLPerf Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    timeout-minutes: 60
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setcap to python
      run: ./extra/amdpci/setup_python_cap.sh
    - name: Remove amd modules
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
    - name: Kill stale pids
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Run MLPerf resnet eval
      run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
      run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=72 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testqualcommbenchmark:
    name: comma Benchmark
    runs-on: [self-hosted, Linux, comma]
    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: openpilot compile3 0.10.0 driving_policy
      run: BENCHMARK_LOG=openpilot_0_10_0_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/driving_policy.onnx
    - name: openpilot compile3 0.10.0 dmonitoring
      run: BENCHMARK_LOG=openpilot_0_10_0_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.10.0/selfdrive/modeld/models/dmonitoring_model.onnx
    - name: DEBUG=2 openpilot compile3 0.10.1 driving_vision
      run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: DEBUG=2 IMAGE=1 openpilot compile3 0.10.1 driving_vision
      run: PYTHONPATH="." DEBUG=2 DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: IMAGE=1 openpilot compile3 0.10.1 driving_vision
      run: BENCHMARK_LOG=image_1_openpilot_0_10_1_vision PYTHONPATH="." DEV=QCOM FLOAT16=1 IMAGE=1 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot compile3 0.10.1 driving_vision
      run: BENCHMARK_LOG=openpilot_0_10_1_vision PYTHONPATH="." ASSERT_MIN_STEP_TIME=17 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot compile3 0.10.1 driving_policy
      run: BENCHMARK_LOG=openpilot_0_10_1_policy PYTHONPATH="." ASSERT_MIN_STEP_TIME=3 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_policy.onnx
    - name: openpilot compile3 0.10.1 dmonitoring
      run: BENCHMARK_LOG=openpilot_0_10_1_dmonitoring PYTHONPATH="." ASSERT_MIN_STEP_TIME=11 DEV=QCOM FLOAT16=1 IMAGE=2 NOLOCALS=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/dmonitoring_model.onnx
    - name: benchmark MobileNetV2 on DSP
      run: |
        # generate quantized weights
        ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
        ln -s /data/home/tiny/tinygrad/testsig-*.so .
        PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
        # benchmark on DSP with NOOPT=1, the devectorizer has issues
        PYTHONPATH=. CC=clang-19 DSP=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testcommausbgpubenchmark:
    name: UsbGPU Benchmark (comma)
    runs-on: [self-hosted, Linux, comma4]
    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: openpilot compile3 0.10.1 driving_vision
      run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision PYTHONPATH="." DEV=AMD AMD_LLVM=1 AMD_IFACE=USB ASSERT_MIN_STEP_TIME=50 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/720392c9a5b986981fdbed1bb8c47a6c5573a50e/selfdrive/modeld/models/driving_vision.onnx
    - name: openpilot load_pickle 0.10.1 driving_vision
      run: BENCHMARK_LOG=usbgpu_openpilot_0_10_1_vision_load_pickle PYTHONPATH="." DEV=AMD AMD_IFACE=USB ASSERT_MIN_LOAD_TIME=15 python3 examples/openpilot/load_pickle.py

  testreddriverbenchmark:
    name: AM Benchmark
    runs-on: [self-hosted, Linux, tinyboxrandom]
    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setcap to python
      run: ./extra/amdpci/setup_python_cap.sh
    - name: Remove amd modules
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
    - name: Kill stale pids
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Test driver cold start time
      run: time DEBUG=3 AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test driver warm start time
      run: time DEBUG=3 AMD=1 python3 test/test_tiny.py TestTiny.test_plus
    # Fails on 9070
    # - name: Test tensor cores
    #   run: |
    #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
    #     AMD=1 AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
    #     AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (AMD)
      run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py
    - name: Test AMD=1
      run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
    - name: Test DISK copy time
      run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
    - name: Test CPU copy time
      run: |
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    # - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  testgreendriverbenchmark:
    name: NV Benchmark
    runs-on: [self-hosted, Linux, tinyboxrandom]
    timeout-minutes: 20
    defaults:
      run:
        shell: bash -e -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setcap to python
      run: ./extra/amdpci/setup_python_cap.sh
    - name: Remove nv modules
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv rmmod
    - name: Kill stale pids
      run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
    - name: Symlink models and datasets
      run: |
        mkdir -p weights
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        mkdir -p extra/datasets
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Test driver start time
      run: time DEBUG=3 NV=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test tensor cores
      run: NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
    - name: Test DISK copy time
      run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
    - name: Test CPU copy time
      run: |
        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Test LLAMA-3
      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF STEPS=1000 TARGET_EVAL_ACC_PCT=93.0 python3 examples/hlb_cifar10.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py