diff --git a/README.md b/README.md index e93898f1..186d727e 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,13 @@ We are now shipping **OSS kernels**, allowing you to inspect, modify, and contri #### Llama 3.1 style Forward and Bprop with causal masking (GB300)

- Llama 3.1 SDPA Benchmark on GB300 (only cuDNN) + Llama 3.1 SDPA Benchmark on GB300 (only cuDNN)

#### Deepseek v3 style Forward and Bprop with causal masking (GB300)

- DSv3 SDPA Benchmark on GB300 (only cuDNN) + DSv3 SDPA Benchmark on GB300 (only cuDNN)

## Key Features diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_20260226_174854.csv b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_20260226_174854.csv deleted file mode 100644 index 868eb174..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_20260226_174854.csv +++ /dev/null @@ -1,11 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -dsv3,dsv3,cudnn,bfloat16,top_left,1,32768,32768,128,128,192,128,both,True,24.232,88.716,1815.000,1289.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,32768,32768,128,128,192,128,both,True,16.924,64.250,2599.000,1780.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,16384,16384,128,128,192,128,both,True,6.318,21.715,1740.000,1317.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,16384,16384,128,128,192,128,both,True,4.475,16.593,2457.000,1723.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,8192,8192,128,128,192,128,both,True,1.800,5.728,1527.000,1248.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,8192,8192,128,128,192,128,both,True,1.235,4.544,2227.000,1573.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,4096,4096,128,128,192,128,both,True,0.509,1.613,1350.000,1108.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,4096,4096,128,128,192,128,both,True,0.359,1.310,1917.000,1365.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,2048,2048,128,128,192,128,both,True,0.161,0.508,1065.000,880.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,2048,2048,128,128,192,128,both,True,0.118,0.411,1455.000,1088.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_top_left.png b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_top_left.png deleted file mode 100644 index e450054f..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/dsv3_top_left.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_20260226_175022.csv b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_20260226_175022.csv deleted file mode 100644 index c7b10480..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_20260226_175022.csv +++ /dev/null @@ -1,11 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,32768,32768,64,8,64,64,both,False,1.711,4.099,632.000,660.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,32768,32768,64,8,64,64,both,False,1.667,43.921,649.000,62.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,16384,16384,64,8,64,64,both,False,0.853,2.024,624.000,658.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,16384,16384,64,8,64,64,both,False,0.831,11.332,641.000,117.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,8192,8192,64,8,64,64,both,False,0.423,0.998,609.000,646.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,8192,8192,64,8,64,64,both,False,0.411,3.011,626.000,214.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,4096,4096,64,8,64,64,both,False,0.209,0.487,574.000,617.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,4096,4096,64,8,64,64,both,False,0.203,0.862,591.000,349.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,2048,2048,64,8,64,64,both,False,0.104,0.230,497.000,559.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,2048,2048,64,8,64,64,both,False,0.101,0.271,512.000,476.000,0.000,10,1024,True,,NVIDIA GB200,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_top_left.png b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_top_left.png deleted file mode 100644 index 996c8006..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/gpt_oss_top_left.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_20260226_174648.csv b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_20260226_174648.csv deleted file mode 100644 index 991af86e..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_20260226_174648.csv +++ /dev/null @@ -1,21 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,32768,32768,64,8,128,128,both,False,10.226,29.863,1720.000,1473.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,32768,32768,64,8,128,128,both,False,19.759,58.606,1781.000,1501.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,32768,32768,64,8,128,128,both,False,7.960,25.803,2210.000,1705.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,32768,32768,64,8,128,128,both,False,15.848,49.473,2220.000,1778.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,16384,16384,64,8,128,128,both,False,2.606,7.845,1687.000,1402.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,16384,16384,64,8,128,128,both,False,4.972,15.111,1769.000,1455.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,16384,16384,64,8,128,128,both,False,2.060,6.764,2135.000,1626.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,16384,16384,64,8,128,128,both,False,4.001,12.683,2199.000,1734.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,8192,8192,64,8,128,128,both,False,0.688,2.097,1598.000,1311.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,8192,8192,64,8,128,128,both,False,1.286,3.901,1709.000,1409.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,8192,8192,64,8,128,128,both,False,0.553,1.860,1987.000,1478.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,8192,8192,64,8,128,128,both,False,1.050,3.372,2094.000,1630.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,4096,4096,64,8,128,128,both,False,0.206,0.607,1335.000,1132.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,4096,4096,64,8,128,128,both,False,0.345,1.068,1593.000,1287.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,4096,4096,64,8,128,128,both,False,0.161,0.559,1707.000,1229.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,4096,4096,64,8,128,128,both,False,0.276,0.938,1995.000,1465.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,2048,2048,64,8,128,128,both,False,0.066,0.206,1042.000,836.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,2048,2048,64,8,128,128,both,False,0.110,0.315,1245.000,1089.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,2048,2048,64,8,128,128,both,False,0.053,0.192,1286.000,897.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,2048,2048,64,8,128,128,both,False,0.085,0.287,1615.000,1198.000,0.000,10,,True,,NVIDIA GB200,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_no_mask.png b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_no_mask.png deleted file mode 100644 index 5d921c88..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_no_mask.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_top_left.png b/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_top_left.png deleted file mode 100644 index 03865f69..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb200_919_only_cudnn/llama3.1_top_left.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_20260226_185831.csv b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_20260226_185831.csv deleted file mode 100644 index b56b8cd9..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_20260226_185831.csv +++ /dev/null @@ -1,11 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -dsv3,dsv3,cudnn,bfloat16,top_left,1,32768,32768,128,128,192,128,both,True,21.483,81.498,2047.000,1403.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,32768,32768,128,128,192,128,both,True,12.976,55.846,3389.000,2048.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,16384,16384,128,128,192,128,both,True,5.644,20.522,1948.000,1393.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,16384,16384,128,128,192,128,both,True,3.420,14.642,3215.000,1953.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,8192,8192,128,128,192,128,both,True,1.548,5.432,1776.000,1316.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,8192,8192,128,128,192,128,both,True,0.951,4.108,2890.000,1740.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,4096,4096,128,128,192,128,both,True,0.454,1.544,1515.000,1158.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,4096,4096,128,128,192,128,both,True,0.285,1.197,2410.000,1493.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,bfloat16,top_left,1,2048,2048,128,128,192,128,both,True,0.148,0.490,1165.000,912.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -dsv3,dsv3,cudnn,fp8,top_left,1,2048,2048,128,128,192,128,both,True,0.107,0.380,1603.000,1175.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_top_left.png b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_top_left.png deleted file mode 100644 index 518ea62c..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/dsv3_top_left.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_20260226_185932.csv b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_20260226_185932.csv deleted file mode 100644 index c89bab76..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_20260226_185932.csv +++ /dev/null @@ -1,11 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,32768,32768,64,8,64,64,both,False,1.448,3.930,748.000,689.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,32768,32768,64,8,64,64,both,False,1.204,39.694,899.000,68.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,16384,16384,64,8,64,64,both,False,0.722,1.951,738.000,682.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,16384,16384,64,8,64,64,both,False,0.600,10.265,887.000,130.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,8192,8192,64,8,64,64,both,False,0.359,0.965,718.000,668.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,8192,8192,64,8,64,64,both,False,0.299,2.752,863.000,234.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,4096,4096,64,8,64,64,both,False,0.178,0.472,675.000,638.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,4096,4096,64,8,64,64,both,False,0.148,0.792,811.000,379.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,bfloat16,top_left,2,2048,2048,64,8,64,64,both,False,0.089,0.221,581.000,583.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 -gpt_oss,gpt_oss,cudnn,fp8,top_left,2,2048,2048,64,8,64,64,both,False,0.074,0.253,694.000,509.000,0.000,10,1024,True,,NVIDIA GB300,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_top_left.png b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_top_left.png deleted file mode 100644 index c43787ee..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/gpt_oss_top_left.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_20260226_185510.csv b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_20260226_185510.csv deleted file mode 100644 index f367ac86..00000000 --- a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_20260226_185510.csv +++ /dev/null @@ -1,21 +0,0 @@ -config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,sliding_window_size,success,error_message,gpu_name,cudnn_version,cudnn_backend_version -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,32768,32768,64,8,128,128,both,False,8.734,28.390,2014.000,1549.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,32768,32768,64,8,128,128,both,False,18.719,56.571,1880.000,1555.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,32768,32768,64,8,128,128,both,False,5.623,23.752,3129.000,1852.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,32768,32768,64,8,128,128,both,False,11.145,45.673,3157.000,1926.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,16384,16384,64,8,128,128,both,False,2.235,7.380,1968.000,1490.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,16384,16384,64,8,128,128,both,False,4.415,14.110,1992.000,1559.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,16384,16384,64,8,128,128,both,False,1.457,6.253,3019.000,1759.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,16384,16384,64,8,128,128,both,False,2.805,11.790,3136.000,1865.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,8192,8192,64,8,128,128,both,False,0.583,1.985,1887.000,1385.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,8192,8192,64,8,128,128,both,False,1.140,3.663,1930.000,1501.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,8192,8192,64,8,128,128,both,False,0.396,1.730,2780.000,1590.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,8192,8192,64,8,128,128,both,False,0.734,3.153,2997.000,1744.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,4096,4096,64,8,128,128,both,False,0.164,0.575,1672.000,1195.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,4096,4096,64,8,128,128,both,False,0.294,1.016,1872.000,1353.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,4096,4096,64,8,128,128,both,False,0.118,0.525,2324.000,1309.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,4096,4096,64,8,128,128,both,False,0.193,0.880,2855.000,1562.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,top_left,1,2048,2048,64,8,128,128,both,False,0.054,0.190,1266.000,906.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,2048,2048,64,8,128,128,both,False,0.088,0.294,1568.000,1168.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,top_left,1,2048,2048,64,8,128,128,both,False,0.042,0.182,1636.000,945.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 -llama3.1,llama3.1,cudnn,fp8,no_mask,1,2048,2048,64,8,128,128,both,False,0.060,0.273,2277.000,1258.000,0.000,10,,True,,NVIDIA GB300,1.19.0,91901 diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_no_mask.png b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_no_mask.png deleted file mode 100644 index eeace4f1..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_no_mask.png and /dev/null differ diff --git a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_top_left.png b/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_top_left.png deleted file mode 100644 index 4150627c..00000000 Binary files a/benchmark/sdpa_benchmark_training/results/gb300_919_only_cudnn/llama3.1_top_left.png and /dev/null differ